[llvm] 62d946e - GlobalISel: Merge some AMDGPU ABI lowering code to generic code

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 18 14:27:08 PST 2021


Author: Matt Arsenault
Date: 2021-02-18T17:26:55-05:00
New Revision: 62d946e133f748d4500903c2b80fc456ff409505

URL: https://github.com/llvm/llvm-project/commit/62d946e133f748d4500903c2b80fc456ff409505
DIFF: https://github.com/llvm/llvm-project/commit/62d946e133f748d4500903c2b80fc456ff409505.diff

LOG: GlobalISel: Merge some AMDGPU ABI lowering code to generic code

AMDGPU currently has a lot of pre-processing code to pre-split
argument types into 32-bit pieces before passing them to the generic
code in handleAssignments. This is a bit sloppy and also requires some
overly fancy iterator work when building the calls. It's better if all
argument marshalling code is handled directly in
handleAssignments. This also handles more situations, such as
decomposing vectors with large elements into sub-element sized pieces.
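
As a rough illustration (a hypothetical MIR sketch with invented
register names, not taken from the patch), an argument of type
<2 x s64> that the calling convention assigns to four 32-bit registers
is now reassembled inside handleAssignments roughly as:

  %elt0:_(s64) = G_MERGE_VALUES %part0(s32), %part1(s32)
  %elt1:_(s64) = G_MERGE_VALUES %part2(s32), %part3(s32)
  %arg:_(<2 x s64>) = G_BUILD_VECTOR %elt0(s64), %elt1(s64)

This corresponds to the large-element branch of the buildCopyToParts
helper added below.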

This should mostly be NFC, but does change the generated code by
shifting where the initial argument packing instructions are placed. I
think this is nicer looking, since it now emits the packing code
directly after the relevant copies, rather than after the copies for
the remaining arguments.

This doubles down on gfx6/gfx7 using the gfx8+ ABI for 16-bit
types. This is ultimately the better option, but it is incompatible
with the DAG lowering. Fixing this requires more work, especially for
f16.
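
As a hedged sketch of what the 16-bit ABI change means, on gfx6/gfx7 a
<2 x i16> argument now arrives packed in a single VGPR and is unpacked
with a bitcast and shift, rather than occupying one VGPR per element
(register names invented; compare the dummy-target.ll diff below):

  %arg:_(<2 x s16>) = COPY $vgpr0
  %bits:_(s32) = G_BITCAST %arg(<2 x s16>)
  %c16:_(s32) = G_CONSTANT i32 16
  %hi:_(s32) = G_LSHR %bits, %c16(s32)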

Added: 
    

Modified: 
    llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 0c54a090ecc7..9b7954dabb33 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -228,6 +228,119 @@ void CallLowering::unpackRegs(ArrayRef<Register> DstRegs, Register SrcReg,
     MIRBuilder.buildExtract(DstRegs[i], SrcReg, Offsets[i]);
 }
 
+/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
+static MachineInstrBuilder
+mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
+                            ArrayRef<Register> SrcRegs) {
+  MachineRegisterInfo &MRI = *B.getMRI();
+  LLT LLTy = MRI.getType(DstRegs[0]);
+  LLT PartLLT = MRI.getType(SrcRegs[0]);
+
+  // Deal with v3s16 split into v2s16
+  LLT LCMTy = getLCMType(LLTy, PartLLT);
+  if (LCMTy == LLTy) {
+    // Common case where no padding is needed.
+    assert(DstRegs.size() == 1);
+    return B.buildConcatVectors(DstRegs[0], SrcRegs);
+  }
+
+  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
+  Register Undef = B.buildUndef(PartLLT).getReg(0);
+
+  // Build vector of undefs.
+  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
+
+  // Replace the first sources with the real registers.
+  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
+
+  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
+  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
+
+  SmallVector<Register, 8> PadDstRegs(NumDst);
+  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());
+
+  // Create the excess dead defs for the unmerge.
+  for (int I = DstRegs.size(); I != NumDst; ++I)
+    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
+
+  return B.buildUnmerge(PadDstRegs, Widened);
+}
+
+/// Create a sequence of instructions to combine pieces split into register
+/// typed values to the original IR value. \p OrigRegs contains the destination
+/// value registers of type \p LLTy, and \p Regs contains the legalized pieces
+/// with type \p PartLLT.
+static void buildCopyToParts(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
+                             ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT) {
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  if (!LLTy.isVector() && !PartLLT.isVector()) {
+    assert(OrigRegs.size() == 1);
+    LLT OrigTy = MRI.getType(OrigRegs[0]);
+
+    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
+    if (SrcSize == OrigTy.getSizeInBits())
+      B.buildMerge(OrigRegs[0], Regs);
+    else {
+      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
+      B.buildTrunc(OrigRegs[0], Widened);
+    }
+
+    return;
+  }
+
+  if (LLTy.isVector() && PartLLT.isVector()) {
+    assert(OrigRegs.size() == 1);
+    assert(LLTy.getElementType() == PartLLT.getElementType());
+    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
+    return;
+  }
+
+  assert(LLTy.isVector() && !PartLLT.isVector());
+
+  LLT DstEltTy = LLTy.getElementType();
+
+  // Pointer information was discarded. We'll need to coerce some register types
+  // to avoid violating type constraints.
+  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();
+
+  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());
+
+  if (DstEltTy == PartLLT) {
+    // Vector was trivially scalarized.
+
+    if (RealDstEltTy.isPointer()) {
+      for (Register Reg : Regs)
+        MRI.setType(Reg, RealDstEltTy);
+    }
+
+    B.buildBuildVector(OrigRegs[0], Regs);
+  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
+    // Deal with vector with 64-bit elements decomposed to 32-bit
+    // registers. Need to create intermediate 64-bit elements.
+    SmallVector<Register, 8> EltMerges;
+    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();
+
+    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);
+
+    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
+      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
+      // Fix the type in case this is really a vector of pointers.
+      MRI.setType(Merge.getReg(0), RealDstEltTy);
+      EltMerges.push_back(Merge.getReg(0));
+      Regs = Regs.drop_front(PartsPerElt);
+    }
+
+    B.buildBuildVector(OrigRegs[0], EltMerges);
+  } else {
+    // Vector was split, and elements promoted to a wider type.
+    // FIXME: Should handle floating point promotions.
+    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
+    auto BV = B.buildBuildVector(BVType, Regs);
+    B.buildTrunc(OrigRegs[0], BV);
+  }
+}
+
 bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
                                      SmallVectorImpl<ArgInfo> &Args,
                                      ValueHandler &Handler,
@@ -278,9 +391,6 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
     }
 
     assert(NumParts > 1);
-    // For now only handle exact splits.
-    if (NewVT.getSizeInBits() * NumParts != CurVT.getSizeInBits())
-      return false;
 
     // For incoming arguments (physregs to vregs), we could have values in
     // physregs (or memlocs) which we want to extract and copy to vregs.
@@ -379,6 +489,7 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
     EVT OrigVT = EVT::getEVT(Args[i].Ty);
     EVT VAVT = VA.getValVT();
     const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);
+    const LLT VATy(VAVT.getSimpleVT());
 
     // Expected to be multiple regs for a single incoming arg.
     // There should be Regs.size() ArgLocs per argument.
@@ -427,7 +538,6 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
       }
 
       // This ArgLoc covers multiple pieces, so we need to split it.
-      const LLT VATy(VAVT.getSimpleVT());
       Register NewReg =
         MIRBuilder.getMRI()->createGenericVirtualRegister(VATy);
       Handler.assignValueToReg(NewReg, VA.getLocReg(), VA);
@@ -451,12 +561,12 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
     // Now that all pieces have been handled, re-pack any arguments into any
     // wider, original registers.
     if (Handler.isIncomingArgumentHandler()) {
-      if (VAVT.getFixedSizeInBits() < OrigVT.getFixedSizeInBits()) {
-        assert(NumArgRegs >= 2);
+      // Merge the split registers into the expected larger result vregs of
+      // the original call.
 
-        // Merge the split registers into the expected larger result vreg
-        // of the original call.
-        MIRBuilder.buildMerge(Args[i].OrigRegs[0], Args[i].Regs);
+      if (OrigTy != VATy && !Args[i].OrigRegs.empty()) {
+        buildCopyToParts(MIRBuilder, Args[i].OrigRegs, Args[i].Regs, OrigTy,
+                         VATy);
       }
     }
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index e7a0d7e52f22..6d0df5ce53c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -735,117 +735,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   return true;
 }
 
-/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
-static MachineInstrBuilder mergeVectorRegsToResultRegs(
-  MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
-  MachineRegisterInfo &MRI = *B.getMRI();
-  LLT LLTy = MRI.getType(DstRegs[0]);
-  LLT PartLLT = MRI.getType(SrcRegs[0]);
-
-  // Deal with v3s16 split into v2s16
-  LLT LCMTy = getLCMType(LLTy, PartLLT);
-  if (LCMTy == LLTy) {
-    // Common case where no padding is needed.
-    assert(DstRegs.size() == 1);
-    return B.buildConcatVectors(DstRegs[0], SrcRegs);
-  }
-
-  const int NumWide =  LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
-  Register Undef = B.buildUndef(PartLLT).getReg(0);
-
-  // Build vector of undefs.
-  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
-
-  // Replace the first sources with the real registers.
-  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
-
-  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
-  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
-
-  SmallVector<Register, 8> PadDstRegs(NumDst);
-  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());
-
-  // Create the excess dead defs for the unmerge.
-  for (int I = DstRegs.size(); I != NumDst; ++I)
-    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
-
-  return B.buildUnmerge(PadDstRegs, Widened);
-}
-
-// TODO: Move this to generic code
-static void packSplitRegsToOrigType(MachineIRBuilder &B,
-                                    ArrayRef<Register> OrigRegs,
-                                    ArrayRef<Register> Regs,
-                                    LLT LLTy,
-                                    LLT PartLLT) {
-  MachineRegisterInfo &MRI = *B.getMRI();
-
-  if (!LLTy.isVector() && !PartLLT.isVector()) {
-    assert(OrigRegs.size() == 1);
-    LLT OrigTy = MRI.getType(OrigRegs[0]);
-
-    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
-    if (SrcSize == OrigTy.getSizeInBits())
-      B.buildMerge(OrigRegs[0], Regs);
-    else {
-      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
-      B.buildTrunc(OrigRegs[0], Widened);
-    }
-
-    return;
-  }
-
-  if (LLTy.isVector() && PartLLT.isVector()) {
-    assert(OrigRegs.size() == 1);
-    assert(LLTy.getElementType() == PartLLT.getElementType());
-    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
-    return;
-  }
-
-  assert(LLTy.isVector() && !PartLLT.isVector());
-
-  LLT DstEltTy = LLTy.getElementType();
-
-  // Pointer information was discarded. We'll need to coerce some register types
-  // to avoid violating type constraints.
-  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();
-
-  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());
-
-  if (DstEltTy == PartLLT) {
-    // Vector was trivially scalarized.
-
-    if (RealDstEltTy.isPointer()) {
-      for (Register Reg : Regs)
-        MRI.setType(Reg, RealDstEltTy);
-    }
-
-    B.buildBuildVector(OrigRegs[0], Regs);
-  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
-    // Deal with vector with 64-bit elements decomposed to 32-bit
-    // registers. Need to create intermediate 64-bit elements.
-    SmallVector<Register, 8> EltMerges;
-    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();
-
-    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);
-
-    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I)  {
-      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
-      // Fix the type in case this is really a vector of pointers.
-      MRI.setType(Merge.getReg(0), RealDstEltTy);
-      EltMerges.push_back(Merge.getReg(0));
-      Regs = Regs.drop_front(PartsPerElt);
-    }
-
-    B.buildBuildVector(OrigRegs[0], EltMerges);
-  } else {
-    // Vector was split, and elements promoted to a wider type.
-    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
-    auto BV = B.buildBuildVector(BVType, Regs);
-    B.buildTrunc(OrigRegs[0], BV);
-  }
-}
-
 bool AMDGPUCallLowering::lowerFormalArguments(
     MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
     FunctionLoweringInfo &FLI) const {
@@ -886,7 +775,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
     CCInfo.AllocateReg(ImplicitBufferPtrReg);
   }
 
-  SmallVector<ArgInfo, 8> SplitArg;
   SmallVector<ArgInfo, 32> SplitArgs;
   unsigned Idx = 0;
   unsigned PSInputNum = 0;
@@ -936,19 +824,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
     const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
     setArgFlags(OrigArg, OrigArgIdx, DL, F);
 
-    SplitArg.clear();
-    splitToValueTypes(B, OrigArg, SplitArg, DL, CC);
-
-    processSplitArgs(B, OrigArg, SplitArg, SplitArgs, DL, CC, false,
-                     // FIXME: We should probably be passing multiple registers
-                     // to handleAssignments to do this
-                     [&](ArrayRef<Register> Regs, Register DstReg, LLT LLTy,
-                         LLT PartLLT, int VTSplitIdx) {
-                       assert(DstReg == VRegs[Idx][VTSplitIdx]);
-                       packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
-                                               LLTy, PartLLT);
-                     });
-
+    splitToValueTypes(B, OrigArg, SplitArgs, DL, CC);
     ++Idx;
   }
 
@@ -1356,19 +1232,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
     insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
                     Info.DemoteRegister, Info.DemoteStackIndex);
   } else if (!Info.OrigRet.Ty->isVoidTy()) {
-    SmallVector<ArgInfo, 8> PreSplitRetInfos;
-
-    splitToValueTypes(
-      MIRBuilder, Info.OrigRet, PreSplitRetInfos/*InArgs*/, DL, Info.CallConv);
-
-    processSplitArgs(MIRBuilder, Info.OrigRet,
-                     PreSplitRetInfos, InArgs/*SplitRetInfos*/, DL, Info.CallConv, false,
-                     [&](ArrayRef<Register> Regs, Register DstReg,
-                         LLT LLTy, LLT PartLLT, int VTSplitIdx) {
-                       assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]);
-                       packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx],
-                                               Regs, LLTy, PartLLT);
-                     });
+    splitToValueTypes(MIRBuilder, Info.OrigRet, InArgs, DL, Info.CallConv);
   }
 
   // Make sure the raw argument copies are inserted before the marshalling to

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index c01dd73e1e3c..9413e907f499 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -285,15 +285,8 @@ define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
 define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_andn2_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
-; GFX6-NEXT:    s_xor_b32 s1, s1, -1
-; GFX6-NEXT:    s_and_b32 s0, s0, s1
+; GFX6-NEXT:    s_xor_b32 s0, s3, -1
+; GFX6-NEXT:    s_and_b32 s0, s2, s0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_v2i16:
@@ -309,15 +302,8 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1
 define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_andn2_v2i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
-; GFX6-NEXT:    s_xor_b32 s1, s1, -1
-; GFX6-NEXT:    s_and_b32 s0, s1, s0
+; GFX6-NEXT:    s_xor_b32 s0, s3, -1
+; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_v2i16_commute:
@@ -333,15 +319,8 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inr
 define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_andn2_v2i16_multi_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
-; GFX6-NEXT:    s_xor_b32 s1, s1, -1
-; GFX6-NEXT:    s_and_b32 s0, s0, s1
+; GFX6-NEXT:    s_xor_b32 s1, s3, -1
+; GFX6-NEXT:    s_and_b32 s0, s2, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_v2i16_multi_use:
@@ -362,19 +341,9 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
 define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
 ; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s3, s4, s1
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
-; GFX6-NEXT:    s_and_b32 s1, s6, s1
-; GFX6-NEXT:    s_or_b32 s1, s3, s1
-; GFX6-NEXT:    s_xor_b32 s1, s1, -1
-; GFX6-NEXT:    s_and_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s2, s1
+; GFX6-NEXT:    s_xor_b32 s1, s4, -1
+; GFX6-NEXT:    s_and_b32 s0, s2, s1
+; GFX6-NEXT:    s_and_b32 s1, s3, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use:
@@ -397,13 +366,6 @@ define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 ; GFX6-LABEL: v_andn2_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index be25cc78a18c..0246d87861e6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -473,13 +473,13 @@ define <2 x i16> @v_ashr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-LABEL: v_ashr_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, v2, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
+; GFX6-NEXT:    v_bfe_i32 v1, v2, 0, 16
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_v2i16:
@@ -503,6 +503,7 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_ashr_v2i16_15:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 15, v0
@@ -531,12 +532,13 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ; GFX6-LABEL: s_ashr_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s4
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX6-NEXT:    s_and_b32 s1, s1, s4
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s0
-; GFX6-NEXT:    s_ashr_i32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s2, s3, s4
-; GFX6-NEXT:    s_sext_i32_i16 s1, s1
-; GFX6-NEXT:    s_ashr_i32 s1, s1, s2
+; GFX6-NEXT:    s_ashr_i32 s0, s0, s1
+; GFX6-NEXT:    s_sext_i32_i16 s1, s2
+; GFX6-NEXT:    s_ashr_i32 s1, s1, s3
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
 ; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -577,10 +579,11 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 ; GFX6-LABEL: ashr_v2i16_sv:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s2, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX6-NEXT:    v_ashr_i32_e32 v0, s0, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s1
 ; GFX6-NEXT:    v_ashr_i32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
@@ -610,13 +613,14 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: ashr_v2i16_vs:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    s_mov_b32 s2, 0xffff
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, s1, v1
 ; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, s2
-; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index e3775b62a4d2..91a30467f727 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -369,19 +369,18 @@ define i16 @v_bswap_i16(i16 %src) {
 define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
 ; GFX7-LABEL: s_bswap_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s3, 0xffff
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX7-NEXT:    s_and_b32 s3, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s2, s0, 8
-; GFX7-NEXT:    s_and_b32 s0, s0, s3
-; GFX7-NEXT:    s_lshr_b32 s0, s0, 8
-; GFX7-NEXT:    s_or_b32 s0, s0, s2
-; GFX7-NEXT:    s_lshl_b32 s2, s1, 8
-; GFX7-NEXT:    s_and_b32 s1, s1, s3
-; GFX7-NEXT:    s_lshr_b32 s1, s1, 8
-; GFX7-NEXT:    s_or_b32 s1, s1, s2
-; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x100000
-; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x100000
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_lshr_b32 s3, s3, 8
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x100000
+; GFX7-NEXT:    s_or_b32 s2, s3, s2
+; GFX7-NEXT:    s_bfe_u32 s1, s2, 0x100000
+; GFX7-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX7-NEXT:    s_or_b32 s0, s1, s0
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_bswap_v2i16:
@@ -468,15 +467,15 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
 ; GFX7-LABEL: v_bswap_v2i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
-; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_v2i16:

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
index 81f6b8e71254..788eab69d7ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
@@ -67,22 +67,26 @@ define i16 @halfinsts_add_i16(i16 %arg0) #1 {
 define <2 x i16> @halfinsts_add_v2i16(<2 x i16> %arg0) #1 {
   ; CHECK-LABEL: name: halfinsts_add_v2i16
   ; CHECK: bb.1 (%ir-block.0):
-  ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
-  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY]](s32)
-  ; CHECK:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY4]]
-  ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-  ; CHECK:   [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
-  ; CHECK:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY5]], [[COPY6]]
-  ; CHECK:   [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
-  ; CHECK:   [[COPY8:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32)
-  ; CHECK:   $vgpr0 = COPY [[COPY7]](s32)
-  ; CHECK:   $vgpr1 = COPY [[COPY8]](s32)
-  ; CHECK:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
-  ; CHECK:   S_SETPC_B64_return [[COPY9]], implicit $vgpr0, implicit $vgpr1
+  ; CHECK:   liveins: $vgpr0, $sgpr30_sgpr31
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+  ; CHECK:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+  ; CHECK:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+  ; CHECK:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+  ; CHECK:   [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32)
+  ; CHECK:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY3]]
+  ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32)
+  ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32)
+  ; CHECK:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY4]], [[COPY5]]
+  ; CHECK:   [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+  ; CHECK:   [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32)
+  ; CHECK:   $vgpr0 = COPY [[COPY6]](s32)
+  ; CHECK:   $vgpr1 = COPY [[COPY7]](s32)
+  ; CHECK:   [[COPY8:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]]
+  ; CHECK:   S_SETPC_B64_return [[COPY8]], implicit $vgpr0, implicit $vgpr1
   %add = add <2 x i16> %arg0, %arg0
   ret <2 x i16> %add
 }

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 47bf7004f0ff..f3886ad36eda 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -406,44 +406,48 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_v2f16:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v2
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
 ; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v5, v5
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v5
 ; GFX6-IEEE-NEXT:    v_fma_f32 v8, -v4, v7, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v6, v0
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v0, v4, v5, v7
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v0, v3, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v6
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v6, v1, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v1, v6
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v2
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v5, v4
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v2, v3, v2
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v7, v5, v5
@@ -452,25 +456,25 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v4, -v4, v7, v6
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v3, v2
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v5
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v5, v1, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v4
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v5
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -525,15 +529,17 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
 ; GFX6-LABEL: v_fdiv_v2f16_afn:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -567,44 +573,48 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v2
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
 ; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v5, v5
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v5
 ; GFX6-IEEE-NEXT:    v_fma_f32 v8, -v4, v7, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v6, v0
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v0, v4, v5, v7
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v0, v3, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v6
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v6, v1, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v1, v6
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v2
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v5, v4
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v2, v3, v2
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v7, v5, v5
@@ -613,25 +623,25 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v4, -v4, v7, v6
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v3, v2
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v5
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v5, v1, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v4
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v5
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -686,32 +696,33 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-IEEE-LABEL: v_rcp_v2f16:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v1
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v2, v1
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v0, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v0, v2, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v5, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v5, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -719,11 +730,12 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_movk_i32 s6, 0x3c00
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s6
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, s6
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v1
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v2, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
@@ -733,24 +745,24 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s6
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v2, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v4
-; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v4, v5, v4
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, v6, v2, v2
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v3, v2
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v1, v6, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v2, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v1, -v1, v6, v3
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v1, v5, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -801,32 +813,33 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v1
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v2, v1
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v0, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v0, v2, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v5, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v5, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -834,11 +847,12 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_movk_i32 s6, 0x3c00
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s6
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, s6
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v1
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v2, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
@@ -848,24 +862,24 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s6
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v2, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v4
-; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v4, v5, v4
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, v6, v2, v2
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v3, v2
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v1, v6, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v2, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v1, -v1, v6, v3
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v1, v5, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -916,14 +930,15 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
 ; GFX6-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX6-NEXT:    v_rcp_f32_e32 v3, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -953,32 +968,33 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v1
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v2, v1
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
 ; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v0, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v0, v2, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v5, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v5, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -986,11 +1002,12 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_movk_i32 s6, 0x3c00
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s6
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, s6
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v1
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v2, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
@@ -1000,24 +1017,24 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s6
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v2, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v4
-; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v4, v5, v4
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, v6, v2, v2
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v3, v2
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v1, v6, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v2, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v1, -v1, v6, v3
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v1, v5, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1047,15 +1064,17 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1089,44 +1108,48 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v2
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v5, v4
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
 ; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v5, v5
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v5
 ; GFX6-IEEE-NEXT:    v_fma_f32 v8, -v4, v7, v6
 ; GFX6-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; GFX6-IEEE-NEXT:    v_fma_f32 v4, -v4, v7, v6
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v6, v0
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v0, v4, v5, v7
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v0, v3, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v6, v5
-; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v6
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v6, v1, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v7, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v1, v6
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, v2
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v5, v4
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v6, vcc, v2, v3, v2
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v7, v5, v5
@@ -1135,25 +1158,25 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v7, v8, v5, v7
 ; GFX6-FLUSH-NEXT:    v_fma_f32 v4, -v4, v7, v6
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v4, v3, v2
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
-; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v5
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v5, v1, v5
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v4
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v5
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1208,15 +1231,17 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index d8fbcfaac200..1638d3d06e4a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -80,15 +80,18 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; GFX6-LABEL: v_fma_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX6-NEXT:    v_fma_f32 v1, v3, v4, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -118,20 +121,19 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
 ; GFX6-LABEL: v_fma_v2f16_fneg_lhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX6-NEXT:    v_fma_f32 v1, v3, v4, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -163,20 +165,19 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
 ; GFX6-LABEL: v_fma_v2f16_fneg_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX6-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX6-NEXT:    v_fma_f32 v1, v3, v4, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -208,27 +209,21 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
 ; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v6, 0xffff
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v6
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v6
 ; GFX6-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT:    v_fma_f32 v0, v0, v1, v4
+; GFX6-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_fma_f32 v1, v2, v3, v5
+; GFX6-NEXT:    v_fma_f32 v1, v3, v4, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index ad8f1d17b8dd..530830029c13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -109,15 +109,17 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
-; GFX6-NEXT:    v_log_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_log_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v2, v3
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -169,23 +171,21 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_log_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_log_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v2, v3
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v3
-; GFX6-NEXT:    v_exp_f32_e32 v2, v0
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v1
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
@@ -237,20 +237,18 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT:    v_log_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
+; GFX6-NEXT:    v_log_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
+; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v2, v3
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -305,15 +303,8 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_mov_b32 s4, 0x80008000
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
index 3cd0e0fdc4ac..2a9bd6bcf1d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -1194,9 +1194,9 @@ define <33 x i32> @v33i32_func_v33i32_i32(<33 x i32> addrspace(1)* %p, i32 %idx)
   ; CHECK:   [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY3]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
   ; CHECK:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.a16.ll
index 9a0718e1a629..2410fcb31269 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.a16.ll
@@ -14,8 +14,8 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i16 %s) {
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[DEF]](s32)
@@ -37,8 +37,8 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i16 %s) {
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[DEF]](s32)
@@ -66,9 +66,9 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i16 %s, i16 %t)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
@@ -90,9 +90,9 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i16 %s, i16 %t)
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
@@ -120,10 +120,10 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i16 %s, i16 %t,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
@@ -149,10 +149,10 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i16 %s, i16 %t,
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
@@ -184,9 +184,9 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i16 %s, i1
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
@@ -208,9 +208,9 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i16 %s, i1
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32)
@@ -238,10 +238,10 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i16 %s, i1
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
@@ -267,10 +267,10 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i16 %s, i1
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
@@ -302,10 +302,10 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i16 %s, i16 %
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
@@ -331,10 +331,10 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i16 %s, i16 %
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
@@ -366,13 +366,13 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
@@ -389,13 +389,13 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[DEF]](s32)
@@ -418,14 +418,14 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -442,14 +442,14 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -472,15 +472,15 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -501,15 +501,15 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -536,14 +536,14 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -560,14 +560,14 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -590,15 +590,15 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -619,15 +619,15 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -654,15 +654,15 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -683,15 +683,15 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 37cad0496345..df7299573590 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -2884,8 +2884,8 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg
 ; GPRIDX-NEXT:    s_mov_b32 s28, s30
 ; GPRIDX-NEXT:    s_mov_b32 s29, s31
 ; GPRIDX-NEXT:    s_mov_b32 s31, s33
-; GPRIDX-NEXT:    v_mov_b32_e32 v32, v0
 ; GPRIDX-NEXT:    s_mov_b32 s30, s32
+; GPRIDX-NEXT:    v_mov_b32_e32 v32, v0
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, s0
 ; GPRIDX-NEXT:    v_mov_b32_e32 v1, s1
 ; GPRIDX-NEXT:    v_mov_b32_e32 v2, s2
@@ -2956,8 +2956,8 @@ define amdgpu_ps <32 x float> @dyn_insertelement_v32f32_s_v_s(<32 x float> inreg
 ; MOVREL-NEXT:    s_mov_b32 s28, s30
 ; MOVREL-NEXT:    s_mov_b32 s29, s31
 ; MOVREL-NEXT:    s_mov_b32 s31, s33
-; MOVREL-NEXT:    v_mov_b32_e32 v32, v0
 ; MOVREL-NEXT:    s_mov_b32 s30, s32
+; MOVREL-NEXT:    v_mov_b32_e32 v32, v0
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s0
 ; MOVREL-NEXT:    s_mov_b32 m0, s34
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
index 9dff189a5068..556cce5dbb71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll
@@ -131,8 +131,8 @@ define amdgpu_gfx void @test_gfx_call_external_i32_func_i32_imm(i32 addrspace(1)
   ; GCN:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; GCN:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GCN:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; GCN:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
   ; GCN:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; GCN:   [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_gfx_i32_func_i32
@@ -838,9 +838,9 @@ define amdgpu_kernel void @test_call_external_i48_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i48_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32)
   ; GCN:   [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[TRUNC]](s48), [[DEF]](p1) :: (volatile store 6 into `i48 addrspace(1)* undef`, align 8, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call i48 @external_i48_func_void()
@@ -896,9 +896,9 @@ define amdgpu_kernel void @test_call_external_i48_zeroext_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i48_zeroext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32)
   ; GCN:   [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s48)
   ; GCN:   G_STORE [[ZEXT]](s64), [[DEF]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
@@ -956,9 +956,9 @@ define amdgpu_kernel void @test_call_external_i48_signext_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i48_signext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32)
   ; GCN:   [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s48)
   ; GCN:   G_STORE [[SEXT]](s64), [[DEF]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
@@ -1016,8 +1016,8 @@ define amdgpu_kernel void @test_call_external_i64_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call i64 @external_i64_func_void()
@@ -1073,8 +1073,8 @@ define amdgpu_kernel void @test_call_external_p1_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_p1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[MV]](p1), [[DEF]](p1) :: (volatile store 8 into `i8 addrspace(1)* addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call i8 addrspace(1)* @external_p1_func_void()
@@ -1132,10 +1132,10 @@ define amdgpu_kernel void @test_call_external_v2p1_func_void() #0 {
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GCN:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32)
   ; GCN:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32)
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<2 x p1>), [[DEF]](p1) :: (volatile store 16 into `<2 x i8 addrspace(1)*> addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <2 x i8 addrspace(1)*> @external_v2p1_func_void()
@@ -1246,8 +1246,8 @@ define amdgpu_kernel void @test_call_external_v2p3_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2p3_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(p3) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY21]](p3), [[COPY22]](p3)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p3) :: (volatile store 8 into `<2 x i8 addrspace(3)*> addrspace(3)* undef`, addrspace 3)
   ; GCN:   S_ENDPGM 0
   %val = call <2 x i8 addrspace(3)*> @external_v2p3_func_void()
@@ -1414,8 +1414,8 @@ define amdgpu_kernel void @test_call_external_f64_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_f64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store 8 into `double addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call double @external_f64_func_void()
@@ -1473,10 +1473,10 @@ define amdgpu_kernel void @test_call_external_v2f64_func_void() #0 {
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GCN:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32)
   ; GCN:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32)
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<2 x s64>), [[DEF]](p1) :: (volatile store 16 into `<2 x double> addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <2 x double> @external_v2f64_func_void()
@@ -1532,8 +1532,8 @@ define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<2 x s32>), [[DEF]](p1) :: (volatile store 8 into `<2 x i32> addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <2 x i32> @external_v2i32_func_void()
@@ -1590,8 +1590,8 @@ define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 {
   ; GCN:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (volatile store 12 into `<3 x i32> addrspace(1)* undef`, align 8, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <3 x i32> @external_v3i32_func_void()
@@ -1649,8 +1649,8 @@ define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 {
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GCN:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<4 x s32>), [[DEF]](p1) :: (volatile store 16 into `<4 x i32> addrspace(1)* undef`, align 8, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <4 x i32> @external_v4i32_func_void()
@@ -1709,8 +1709,8 @@ define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 {
   ; GCN:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GCN:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GCN:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<5 x s32>), [[DEF]](p1) :: (volatile store 20 into `<5 x i32> addrspace(1)* undef`, align 8, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <5 x i32> @external_v5i32_func_void()
@@ -1772,8 +1772,8 @@ define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 {
   ; GCN:   [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GCN:   [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GCN:   [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<8 x s32>), [[DEF]](p1) :: (volatile store 32 into `<8 x i32> addrspace(1)* undef`, align 8, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <8 x i32> @external_v8i32_func_void()
@@ -1843,8 +1843,8 @@ define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 {
   ; GCN:   [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr13
   ; GCN:   [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr14
   ; GCN:   [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr15
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<16 x s32>), [[DEF]](p1) :: (volatile store 64 into `<16 x i32> addrspace(1)* undef`, align 8, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <16 x i32> @external_v16i32_func_void()
@@ -1930,8 +1930,8 @@ define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 {
   ; GCN:   [[COPY50:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; GCN:   [[COPY51:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; GCN:   [[COPY52:%[0-9]+]]:_(s32) = COPY $vgpr31
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32), [[COPY37]](s32), [[COPY38]](s32), [[COPY39]](s32), [[COPY40]](s32), [[COPY41]](s32), [[COPY42]](s32), [[COPY43]](s32), [[COPY44]](s32), [[COPY45]](s32), [[COPY46]](s32), [[COPY47]](s32), [[COPY48]](s32), [[COPY49]](s32), [[COPY50]](s32), [[COPY51]](s32), [[COPY52]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, align 8, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <32 x i32> @external_v32i32_func_void()
@@ -2042,10 +2042,10 @@ define amdgpu_kernel void @test_call_external_v3i16_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; GCN:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[DEF1]](<2 x s16>)
   ; GCN:   [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[UV]](<3 x s16>), [[DEF]](p1) :: (volatile store 6 into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <3 x i16> @external_v3i16_func_void()
@@ -2101,8 +2101,8 @@ define amdgpu_kernel void @test_call_external_v4i16_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v4i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (volatile store 8 into `<4 x i16> addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <4 x i16> @external_v4i16_func_void()
@@ -2213,10 +2213,10 @@ define amdgpu_kernel void @test_call_external_v3f16_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; GCN:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[DEF1]](<2 x s16>)
   ; GCN:   [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[UV]](<3 x s16>), [[DEF]](p1) :: (volatile store 6 into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <3 x half> @external_v3f16_func_void()
@@ -2272,8 +2272,8 @@ define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 {
   ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v4f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1
   ; GCN:   [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (volatile store 8 into `<4 x half> addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <4 x half> @external_v4f16_func_void()
@@ -2330,8 +2330,8 @@ define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 {
   ; GCN:   [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (volatile store 12 into `<3 x float> addrspace(1)* undef`, align 16, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <3 x float> @external_v3f32_func_void()
@@ -2390,8 +2390,8 @@ define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 {
   ; GCN:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GCN:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GCN:   [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[BUILD_VECTOR]](<5 x s32>), [[DEF]](p1) :: (volatile store 20 into `<5 x float> addrspace(1)* undef`, align 32, addrspace 1)
   ; GCN:   S_ENDPGM 0
   %val = call <5 x float> @external_v5f32_func_void()
@@ -2450,8 +2450,8 @@ define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 {
   ; GCN:   [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN:   [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[COPY22]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
   ; GCN:   G_STORE [[MV]](s64), [[COPY10]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0
@@ -2478,8 +2478,8 @@ define amdgpu_gfx void @test_gfx_call_external_i32_i64_func_void() #0 {
   ; GCN:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; GCN:   G_STORE [[COPY3]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
   ; GCN:   G_STORE [[MV]](s64), [[COPY1]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1)
   ; GCN:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]

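Each hunk in this file has the same shape: the copies from $vgpr0..$vgprN are unchanged, and the G_MERGE_VALUES / G_BUILD_VECTOR / G_CONCAT_VECTORS reassembling the returned value now sits between those copies and ADJCALLSTACKDOWN. A minimal sketch of the IR being translated, using the i64 case from above (the declaration is assumed to match the test's):

declare i64 @external_i64_func_void()

define amdgpu_kernel void @sample_call_i64() {
  %val = call i64 @external_i64_func_void()
  store volatile i64 %val, i64 addrspace(1)* undef
  ret void
}
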
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
index 3788c81edd6c..df781c39bc0a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
@@ -82,11 +82,11 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_strict(<2 x float> %x, <2 x
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[STRICT_FADD:%[0-9]+]]:_(<2 x s32>) = G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[STRICT_FADD]](<2 x s32>)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -103,11 +103,11 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_ignore(<2 x float> %x, <2 x
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   %7:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -124,11 +124,11 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_maytrap(<2 x float> %x, <2
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[STRICT_FADD:%[0-9]+]]:_(<2 x s32>) = G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[STRICT_FADD]](<2 x s32>)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)

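Here too only the ordering changes: each <2 x s32> G_BUILD_VECTOR now follows its own pair of $vgpr copies, and the $sgpr30_sgpr31 copy moves to last. A self-contained sketch of the constrained-fadd pattern these checks cover; the strictfp attribute is assumed, since the verifier expects it on functions that use constrained intrinsics:

declare <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float>, <2 x float>, metadata, metadata)

define <2 x float> @sample_constrained_fadd_v2f32(<2 x float> %x, <2 x float> %y) #0 {
  %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
  ret <2 x float> %val
}

attributes #0 = { strictfp }
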
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index 3bf46196fd58..abec48ca8d93 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -308,9 +308,9 @@ define void @void_func_i48(i48 %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[TRUNC]](s48), [[DEF]](p1) :: (store 6 into `i48 addrspace(1)* undef`, align 8, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -325,9 +325,9 @@ define void @void_func_i48_zeroext(i48 zeroext %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s48)
@@ -347,9 +347,9 @@ define void @void_func_i48_signext(i48 signext %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s48)
@@ -369,8 +369,8 @@ define void @void_func_i64(i64 %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[MV]](s64), [[DEF]](p1) :: (store 8 into `i64 addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -386,9 +386,9 @@ define void @void_func_i95(i95 %arg0) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s95) = G_TRUNC [[MV]](s96)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[TRUNC]](s95), [[DEF]](p1) :: (store 12 into `i95 addrspace(1)* undef`, align 8, addrspace 1)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -404,9 +404,9 @@ define void @void_func_i95_zeroext(i95 zeroext %arg0) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s95) = G_TRUNC [[MV]](s96)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s96) = G_CONSTANT i96 12
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[ZEXT:%[0-9]+]]:_(s96) = G_ZEXT [[TRUNC]](s95)
@@ -427,9 +427,9 @@ define void @void_func_i95_signext(i95 signext %arg0) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s95) = G_TRUNC [[MV]](s96)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s96) = G_CONSTANT i96 12
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[SEXT:%[0-9]+]]:_(s96) = G_SEXT [[TRUNC]](s95)
@@ -450,8 +450,8 @@ define void @void_func_i96(i96 %arg0) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[MV]](s96), [[DEF]](p1) :: (store 12 into `i96 addrspace(1)* undef`, align 8, addrspace 1)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -466,8 +466,8 @@ define void @void_func_p0i8(i8* %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[MV]](p0), [[DEF]](p1) :: (store 8 into `i8* addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -482,8 +482,8 @@ define void @void_func_p1i8(i8 addrspace(1)* %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[MV]](p1), [[DEF]](p1) :: (store 8 into `i8 addrspace(1)* addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -527,8 +527,8 @@ define void @void_func_f64(double %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[MV]](s64), [[DEF]](p1) :: (store 8 into `double addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -543,8 +543,8 @@ define void @void_func_v2i32(<2 x i32> %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<2 x s32>), [[DEF]](p1) :: (store 8 into `<2 x i32> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -559,9 +559,9 @@ define void @void_func_v2i24(<2 x i24> %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(<2 x s24>) = G_TRUNC [[BUILD_VECTOR]](<2 x s32>)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[TRUNC]](<2 x s24>), [[DEF]](p1) :: (store 6 into `<2 x i24> addrspace(1)* undef`, align 8, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -577,9 +577,9 @@ define void @void_func_v3i24(<3 x i24> %arg0) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(<3 x s24>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[TRUNC]](<3 x s24>), [[DEF]](p1) :: (store 9 into `<3 x i24> addrspace(1)* undef`, align 16, addrspace 1)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -596,9 +596,9 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
   ; CHECK:   [[TRUNC2:%[0-9]+]]:_(<2 x s8>) = G_TRUNC [[BUILD_VECTOR]](<2 x s16>)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[TRUNC2]](<2 x s8>), [[DEF]](p1) :: (store 2 into `<2 x i8> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -617,9 +617,9 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
   ; CHECK:   [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16)
   ; CHECK:   [[TRUNC3:%[0-9]+]]:_(<3 x s8>) = G_TRUNC [[BUILD_VECTOR]](<3 x s16>)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[TRUNC3]](<3 x s8>), [[DEF]](p1) :: (store 3 into `<3 x i8> addrspace(1)* undef`, align 4, addrspace 1)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -640,9 +640,9 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
   ; CHECK:   [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK:   [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
   ; CHECK:   [[TRUNC4:%[0-9]+]]:_(<4 x s8>) = G_TRUNC [[BUILD_VECTOR]](<4 x s16>)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[TRUNC4]](<4 x s8>), [[DEF]](p1) :: (store 4 into `<4 x i8> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
@@ -657,8 +657,8 @@ define void @void_func_v2p3i8(<2 x i8 addrspace(3)*> %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY]](p3), [[COPY1]](p3)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p1) :: (store 8 into `<2 x i8 addrspace(3)*> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -674,8 +674,8 @@ define void @void_func_v3i32(<3 x i32> %arg0) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (store 12 into `<3 x i32> addrspace(1)* undef`, align 16, addrspace 1)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -692,8 +692,8 @@ define void @void_func_v4i32(<4 x i32> %arg0) #0 {
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x i32> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
@@ -711,8 +711,8 @@ define void @void_func_v5i32(<5 x i32> %arg0) #0 {
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32)
+  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<5 x s32>), [[DEF]](p1) :: (store 20 into `<5 x i32> addrspace(1)* undef`, align 32, addrspace 1)
   ; CHECK:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY5]]
@@ -733,8 +733,8 @@ define void @void_func_v8i32(<8 x i32> %arg0) #0 {
   ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<8 x s32>), [[DEF]](p1) :: (store 32 into `<8 x i32> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
@@ -763,8 +763,8 @@ define void @void_func_v16i32(<16 x i32> %arg0) #0 {
   ; CHECK:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
   ; CHECK:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
   ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
-  ; CHECK:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)
+  ; CHECK:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<16 x s32>), [[DEF]](p1) :: (store 64 into `<16 x i32> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY17:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY16]]
@@ -809,8 +809,8 @@ define void @void_func_v32i32(<32 x i32> %arg0) #0 {
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (store 128 into `<32 x i32> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]
@@ -858,8 +858,8 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.0, align 16, addrspace 5)
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<33 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[LOAD]](s32)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<33 x s32>), [[DEF]](p1) :: (store 132 into `<33 x i32> addrspace(1)* undef`, align 256, addrspace 1)
   ; CHECK:   [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]
@@ -876,10 +876,10 @@ define void @void_func_v2i64(<2 x i64> %arg0) #0 {
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<2 x s64>), [[DEF]](p1) :: (store 16 into `<2 x i64> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
@@ -896,10 +896,10 @@ define void @void_func_v2p0i8(<2 x i8*> %arg0) #0 {
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[MV]](p0), [[MV1]](p0)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<2 x p0>), [[DEF]](p1) :: (store 16 into `<2 x i8*> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
@@ -916,10 +916,10 @@ define void @void_func_v2p1i8(<2 x i8 addrspace(1)*> %arg0) #0 {
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<2 x p1>), [[DEF]](p1) :: (store 16 into `<2 x i8 addrspace(1)*> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
@@ -938,11 +938,11 @@ define void @void_func_v3i64(<3 x i64> %arg0) #0 {
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<3 x s64>), [[DEF]](p1) :: (store 24 into `<3 x i64> addrspace(1)* undef`, align 32, addrspace 1)
   ; CHECK:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
@@ -963,12 +963,12 @@ define void @void_func_v4i64(<4 x i64> %arg0) #0 {
   ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<4 x s64>), [[DEF]](p1) :: (store 32 into `<4 x i64> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
@@ -991,13 +991,13 @@ define void @void_func_v5i64(<5 x i64> %arg0) #0 {
   ; CHECK:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; CHECK:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8
   ; CHECK:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9
-  ; CHECK:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; CHECK:   [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64)
+  ; CHECK:   [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<5 x s64>), [[DEF]](p1) :: (store 40 into `<5 x i64> addrspace(1)* undef`, align 64, addrspace 1)
   ; CHECK:   [[COPY11:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]]
@@ -1026,7 +1026,6 @@ define void @void_func_v8i64(<8 x i64> %arg0) #0 {
   ; CHECK:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
   ; CHECK:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
   ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
-  ; CHECK:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
@@ -1036,6 +1035,7 @@ define void @void_func_v8i64(<8 x i64> %arg0) #0 {
   ; CHECK:   [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
   ; CHECK:   [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+  ; CHECK:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<8 x s64>), [[DEF]](p1) :: (store 64 into `<8 x i64> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY17:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY16]]
@@ -1080,7 +1080,6 @@ define void @void_func_v16i64(<16 x i64> %arg0) #0 {
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
@@ -1098,6 +1097,7 @@ define void @void_func_v16i64(<16 x i64> %arg0) #0 {
   ; CHECK:   [[MV14:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY28]](s32), [[COPY29]](s32)
   ; CHECK:   [[MV15:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64), [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64), [[MV12]](s64), [[MV13]](s64), [[MV14]](s64), [[MV15]](s64)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<16 x s64>), [[DEF]](p1) :: (store 128 into `<16 x i64> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]
@@ -1126,10 +1126,10 @@ define void @void_func_v3i16(<3 x i16> %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
   ; CHECK:   [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[UV]](<3 x s16>), [[DEF1]](p1) :: (store 6 into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -1144,8 +1144,8 @@ define void @void_func_v4i16(<4 x i16> %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (store 8 into `<4 x i16> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -1161,10 +1161,10 @@ define void @void_func_v5i16(<5 x i16> %arg0) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<10 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>)
   ; CHECK:   [[UV:%[0-9]+]]:_(<5 x s16>), [[UV1:%[0-9]+]]:_(<5 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<10 x s16>)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[UV]](<5 x s16>), [[DEF1]](p1) :: (store 10 into `<5 x i16> addrspace(1)* undef`, align 16, addrspace 1)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -1181,8 +1181,8 @@ define void @void_func_v8i16(<8 x i16> %arg0) #0 {
   ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[CONCAT_VECTORS]](<8 x s16>), [[DEF]](p1) :: (store 16 into `<8 x i16> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
@@ -1203,8 +1203,8 @@ define void @void_func_v16i16(<16 x i16> %arg0) #0 {
   ; CHECK:   [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr7
-  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>)
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[CONCAT_VECTORS]](<16 x s16>), [[DEF]](p1) :: (store 32 into `<16 x i16> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
@@ -1253,10 +1253,10 @@ define void @void_func_v65i16(<65 x i16> %arg0) #0 {
   ; CHECK:   [[COPY31:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr31
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.0, align 16, addrspace 5)
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<130 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>), [[COPY8]](<2 x s16>), [[COPY9]](<2 x s16>), [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[COPY12]](<2 x s16>), [[COPY13]](<2 x s16>), [[COPY14]](<2 x s16>), [[COPY15]](<2 x s16>), [[COPY16]](<2 x s16>), [[COPY17]](<2 x s16>), [[COPY18]](<2 x s16>), [[COPY19]](<2 x s16>), [[COPY20]](<2 x s16>), [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[COPY23]](<2 x s16>), [[COPY24]](<2 x s16>), [[COPY25]](<2 x s16>), [[COPY26]](<2 x s16>), [[COPY27]](<2 x s16>), [[COPY28]](<2 x s16>), [[COPY29]](<2 x s16>), [[COPY30]](<2 x s16>), [[COPY31]](<2 x s16>), [[LOAD]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>), [[DEF]](<2 x s16>)
   ; CHECK:   [[UV:%[0-9]+]]:_(<65 x s16>), [[UV1:%[0-9]+]]:_(<65 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<130 x s16>)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[UV]](<65 x s16>), [[DEF1]](p1) :: (store 130 into `<65 x i16> addrspace(1)* undef`, align 256, addrspace 1)
   ; CHECK:   [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]
@@ -1271,8 +1271,8 @@ define void @void_func_v2f32(<2 x float> %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<2 x s32>), [[DEF]](p1) :: (store 8 into `<2 x float> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -1288,8 +1288,8 @@ define void @void_func_v3f32(<3 x float> %arg0) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (store 12 into `<3 x float> addrspace(1)* undef`, align 16, addrspace 1)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -1306,8 +1306,8 @@ define void @void_func_v4f32(<4 x float> %arg0) #0 {
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
@@ -1328,8 +1328,8 @@ define void @void_func_v8f32(<8 x float> %arg0) #0 {
   ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<8 x s32>), [[DEF]](p1) :: (store 32 into `<8 x float> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
@@ -1358,8 +1358,8 @@ define void @void_func_v16f32(<16 x float> %arg0) #0 {
   ; CHECK:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
   ; CHECK:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
   ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
-  ; CHECK:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32)
+  ; CHECK:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<16 x s32>), [[DEF]](p1) :: (store 64 into `<16 x float> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY17:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY16]]
@@ -1376,10 +1376,10 @@ define void @void_func_v2f64(<2 x double> %arg0) #0 {
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<2 x s64>), [[DEF]](p1) :: (store 16 into `<2 x double> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
@@ -1398,11 +1398,11 @@ define void @void_func_v3f64(<3 x double> %arg0) #0 {
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64)
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<3 x s64>), [[DEF]](p1) :: (store 24 into `<3 x double> addrspace(1)* undef`, align 32, addrspace 1)
   ; CHECK:   [[COPY7:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY6]]
@@ -1423,12 +1423,12 @@ define void @void_func_v4f64(<4 x double> %arg0) #0 {
   ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<4 x s64>), [[DEF]](p1) :: (store 32 into `<4 x double> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
@@ -1457,7 +1457,6 @@ define void @void_func_v8f64(<8 x double> %arg0) #0 {
   ; CHECK:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13
   ; CHECK:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14
   ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
-  ; CHECK:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
@@ -1467,6 +1466,7 @@ define void @void_func_v8f64(<8 x double> %arg0) #0 {
   ; CHECK:   [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32)
   ; CHECK:   [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64)
+  ; CHECK:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<8 x s64>), [[DEF]](p1) :: (store 64 into `<8 x double> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY17:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY16]]
@@ -1511,7 +1511,6 @@ define void @void_func_v16f64(<16 x double> %arg0) #0 {
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
@@ -1529,6 +1528,7 @@ define void @void_func_v16f64(<16 x double> %arg0) #0 {
   ; CHECK:   [[MV14:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY28]](s32), [[COPY29]](s32)
   ; CHECK:   [[MV15:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64), [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64), [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64), [[MV12]](s64), [[MV13]](s64), [[MV14]](s64), [[MV15]](s64)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<16 x s64>), [[DEF]](p1) :: (store 128 into `<16 x double> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY33:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]]
@@ -1557,10 +1557,10 @@ define void @void_func_v3f16(<3 x half> %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>)
   ; CHECK:   [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[UV]](<3 x s16>), [[DEF1]](p1) :: (store 6 into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -1575,8 +1575,8 @@ define void @void_func_v4f16(<4 x half> %arg0) #0 {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (store 8 into `<4 x half> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
@@ -1593,8 +1593,8 @@ define void @void_func_v8f16(<8 x half> %arg0) #0 {
   ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[CONCAT_VECTORS]](<8 x s16>), [[DEF]](p1) :: (store 16 into `<8 x half> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]]
@@ -1615,8 +1615,8 @@ define void @void_func_v16f16(<16 x half> %arg0) #0 {
   ; CHECK:   [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr7
-  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[COPY6]](<2 x s16>), [[COPY7]](<2 x s16>)
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[CONCAT_VECTORS]](<16 x s16>), [[DEF]](p1) :: (store 32 into `<16 x half> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY9:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY8]]
@@ -1633,9 +1633,9 @@ define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY5:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   G_STORE [[COPY]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
@@ -1803,15 +1803,15 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
   ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.2, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
   ; CHECK:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.1, addrspace 5)
   ; CHECK:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.0, align 8, addrspace 5)
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[LOAD2]](s32)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
@@ -1863,6 +1863,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; CHECK:   [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 1 from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
@@ -1872,7 +1873,6 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ; CHECK:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load 2 from %fixed-stack.0, align 4, addrspace 5)
   ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
@@ -1929,12 +1929,12 @@ define void @void_func_v32i32_p3_p5_i16(<32 x i32> %arg0, i8 addrspace(3)* %arg1
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
   ; CHECK:   [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.1, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD1:%[0-9]+]]:_(p5) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.0, addrspace 5)
   ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
@@ -1985,18 +1985,18 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
   ; CHECK:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.2, addrspace 5)
+  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32)
   ; CHECK:   [[FRAME_INDEX2:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
   ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.1, align 8, addrspace 5)
   ; CHECK:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load 4 from %fixed-stack.0, addrspace 5)
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
-  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32)
   ; CHECK:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD2]](s32), [[LOAD3]](s32)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
@@ -2047,12 +2047,12 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1
   ; CHECK:   [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.1, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.0, addrspace 5)
   ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
@@ -2103,6 +2103,7 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7
   ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.7, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6
@@ -2111,6 +2112,9 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
   ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.5, align 8, addrspace 5)
   ; CHECK:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
   ; CHECK:   [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load 4 from %fixed-stack.4, addrspace 5)
+  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
+  ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32)
+  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
   ; CHECK:   [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; CHECK:   [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load 4 from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
@@ -2119,14 +2123,10 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
   ; CHECK:   [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load 4 from %fixed-stack.1, align 8, addrspace 5)
   ; CHECK:   [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load 4 from %fixed-stack.0, addrspace 5)
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
-  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
-  ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32)
-  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD4]](s32), [[LOAD5]](s32)
   ; CHECK:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD6]](s32), [[LOAD7]](s32)
   ; CHECK:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
@@ -2177,6 +2177,7 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7
   ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.7, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6
@@ -2185,6 +2186,7 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
   ; CHECK:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p5) :: (invariant load 4 from %fixed-stack.5, align 8, addrspace 5)
   ; CHECK:   [[FRAME_INDEX3:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.4
   ; CHECK:   [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p5) :: (invariant load 4 from %fixed-stack.4, addrspace 5)
+  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
   ; CHECK:   [[FRAME_INDEX4:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.3
   ; CHECK:   [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX4]](p5) :: (invariant load 4 from %fixed-stack.3, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX5:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.2
@@ -2193,10 +2195,8 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
   ; CHECK:   [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load 4 from %fixed-stack.1, align 8, addrspace 5)
   ; CHECK:   [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load 4 from %fixed-stack.0, addrspace 5)
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
-  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
   ; CHECK:   [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
@@ -2247,6 +2247,7 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15
   ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.15, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14
@@ -2263,6 +2264,7 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
   ; CHECK:   [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX6]](p5) :: (invariant load 4 from %fixed-stack.9, align 8, addrspace 5)
   ; CHECK:   [[FRAME_INDEX7:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.8
   ; CHECK:   [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX7]](p5) :: (invariant load 4 from %fixed-stack.8, addrspace 5)
+  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32)
   ; CHECK:   [[FRAME_INDEX8:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.7
   ; CHECK:   [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX8]](p5) :: (invariant load 4 from %fixed-stack.7, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX9:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.6
@@ -2279,10 +2281,8 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
   ; CHECK:   [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load 4 from %fixed-stack.1, align 8, addrspace 5)
   ; CHECK:   [[FRAME_INDEX15:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load 4 from %fixed-stack.0, addrspace 5)
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
-  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32)
   ; CHECK:   [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD8]](s32), [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32), [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
@@ -2333,6 +2333,7 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.31
   ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.31, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.30
@@ -2365,6 +2366,7 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
   ; CHECK:   [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load 4 from %fixed-stack.17, align 8, addrspace 5)
   ; CHECK:   [[FRAME_INDEX15:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.16
   ; CHECK:   [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load 4 from %fixed-stack.16, addrspace 5)
+  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32), [[LOAD8]](s32), [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32), [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32)
   ; CHECK:   [[FRAME_INDEX16:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15
   ; CHECK:   [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX16]](p5) :: (invariant load 4 from %fixed-stack.15, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX17:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14
@@ -2397,10 +2399,8 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
   ; CHECK:   [[LOAD30:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX30]](p5) :: (invariant load 4 from %fixed-stack.1, align 8, addrspace 5)
   ; CHECK:   [[FRAME_INDEX31:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD31:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX31]](p5) :: (invariant load 4 from %fixed-stack.0, addrspace 5)
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
-  ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32), [[LOAD8]](s32), [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32), [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32)
   ; CHECK:   [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[LOAD16]](s32), [[LOAD17]](s32), [[LOAD18]](s32), [[LOAD19]](s32), [[LOAD20]](s32), [[LOAD21]](s32), [[LOAD22]](s32), [[LOAD23]](s32), [[LOAD24]](s32), [[LOAD25]](s32), [[LOAD26]](s32), [[LOAD27]](s32), [[LOAD28]](s32), [[LOAD29]](s32), [[LOAD30]](s32), [[LOAD31]](s32)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
@@ -2423,9 +2423,9 @@ define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; CHECK:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
@@ -2457,9 +2457,9 @@ define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; CHECK:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
@@ -2520,9 +2520,9 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
   ; CHECK:   [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[COPY14]](s32)
   ; CHECK:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15
   ; CHECK:   [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[COPY15]](s32)
-  ; CHECK:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16), [[TRUNC4]](s16), [[TRUNC5]](s16), [[TRUNC6]](s16), [[TRUNC7]](s16), [[TRUNC8]](s16), [[TRUNC9]](s16), [[TRUNC10]](s16), [[TRUNC11]](s16), [[TRUNC12]](s16), [[TRUNC13]](s16), [[TRUNC14]](s16), [[TRUNC15]](s16)
   ; CHECK:   [[TRUNC16:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[BUILD_VECTOR]](<16 x s16>)
+  ; CHECK:   [[COPY16:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   G_STORE [[TRUNC16]](<16 x s8>), [[DEF]](p1) :: (volatile store 16 into `<16 x i8> addrspace(1)* undef`, addrspace 1)
   ; CHECK:   [[COPY17:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY16]]
@@ -2568,6 +2568,7 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
   ; CHECK:   [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29
   ; CHECK:   [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30
   ; CHECK:   [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.15
   ; CHECK:   [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 2 from %fixed-stack.15, align 16, addrspace 5)
   ; CHECK:   [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.14
@@ -2600,10 +2601,9 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
   ; CHECK:   [[LOAD14:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX14]](p5) :: (invariant load 2 from %fixed-stack.1, align 8, addrspace 5)
   ; CHECK:   [[FRAME_INDEX15:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0
   ; CHECK:   [[LOAD15:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX15]](p5) :: (invariant load 2 from %fixed-stack.0, align 4, addrspace 5)
-  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s16>) = G_BUILD_VECTOR [[LOAD]](s16), [[LOAD1]](s16), [[LOAD2]](s16), [[LOAD3]](s16), [[LOAD4]](s16), [[LOAD5]](s16), [[LOAD6]](s16), [[LOAD7]](s16), [[LOAD8]](s16), [[LOAD9]](s16), [[LOAD10]](s16), [[LOAD11]](s16), [[LOAD12]](s16), [[LOAD13]](s16), [[LOAD14]](s16), [[LOAD15]](s16)
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[BUILD_VECTOR1]](<16 x s16>)
+  ; CHECK:   [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1)
   ; CHECK:   G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, addrspace 1)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
index 7a11f1b363f0..b12953fa6403 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
@@ -10,17 +10,17 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i64(<2 x i32 addrspace(1
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
   ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
-  ; CHECK:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64)
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
   ; CHECK:   [[MUL:%[0-9]+]]:_(<2 x s64>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
@@ -44,11 +44,11 @@ define <2 x i32 addrspace(3)*> @vector_gep_v2p3_index_v2i32(<2 x i32 addrspace(3
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr1
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY]](p3), [[COPY1]](p3)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY]](p3), [[COPY1]](p3)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
   ; CHECK:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
   ; CHECK:   [[MUL:%[0-9]+]]:_(<2 x s32>) = G_MUL [[BUILD_VECTOR1]], [[BUILD_VECTOR2]]
@@ -72,13 +72,13 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i32(<2 x i32 addrspace(1
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
+  ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32)
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[SEXT:%[0-9]+]]:_(<2 x s64>) = G_SEXT [[BUILD_VECTOR1]](<2 x s32>)
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
   ; CHECK:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
@@ -105,13 +105,13 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_i64(<2 x i32 addrspace(1)*
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
+  ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+  ; CHECK:   [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV2]](s64)
   ; CHECK:   [[COPY7:%[0-9]+]]:_(<2 x s64>) = COPY [[BUILD_VECTOR1]](<2 x s64>)
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
@@ -139,11 +139,11 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_i32(<2 x i32 addrspace(1)*
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
+  ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; CHECK:   [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY4]](s32)
   ; CHECK:   [[SEXT:%[0-9]+]]:_(<2 x s64>) = G_SEXT [[BUILD_VECTOR1]](<2 x s32>)
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
@@ -171,17 +171,17 @@ define <2 x i32 addrspace(1)*> @vector_gep_v2p1_index_v2i64_constant(<2 x i32 ad
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
   ; CHECK:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
-  ; CHECK:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1)
   ; CHECK:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; CHECK:   [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV2]](s64), [[MV3]](s64)
+  ; CHECK:   [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
   ; CHECK:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
   ; CHECK:   [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C1]](s64)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
index 44d662ef0947..ebf1b1596d0f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll
@@ -59,8 +59,8 @@ define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(void()* %fptr) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:sreg_64(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; CHECK:   [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK:   [[COPY3:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY3]](<4 x s32>)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
index f0e79511d959..5c94c6fe5bc0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
@@ -9,9 +9,9 @@ define void @test_memcpy_p1_p3_i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src)
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
   ; CHECK:   G_MEMCPY [[MV]](p1), [[COPY2]](p3), [[TRUNC]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3)
@@ -27,9 +27,9 @@ define void @test_memcpy_p1_p3_i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src)
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
   ; CHECK:   G_MEMCPY [[MV]](p1), [[COPY2]](p3), [[C]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -44,9 +44,9 @@ define void @test_memcpy_p1_p3_i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src)
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256
   ; CHECK:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16)
   ; CHECK:   G_MEMCPY [[MV]](p1), [[COPY2]](p3), [[ZEXT]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3)
@@ -63,8 +63,8 @@ define void @test_memcpy_p3_p1_i64(i8 addrspace(3)* %dst, i8 addrspace(1)* %src)
   ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
   ; CHECK:   G_MEMCPY [[COPY]](p3), [[MV]](p1), [[TRUNC]](s32), 0 :: (store 1 into %ir.dst, addrspace 3), (load 1 from %ir.src, addrspace 1)
@@ -81,8 +81,8 @@ define void @test_memcpy_p3_p1_i32(i8 addrspace(3)* %dst, i8 addrspace(1)* %src)
   ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
   ; CHECK:   G_MEMCPY [[COPY]](p3), [[MV]](p1), [[C]](s32), 0 :: (store 1 into %ir.dst, addrspace 3), (load 1 from %ir.src, addrspace 1)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -98,8 +98,8 @@ define void @test_memcpy_p3_p1_i16(i8 addrspace(3)* %dst, i8 addrspace(1)* %src)
   ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256
   ; CHECK:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16)
   ; CHECK:   G_MEMCPY [[COPY]](p3), [[MV]](p1), [[ZEXT]](s32), 0 :: (store 1 into %ir.dst, addrspace 3), (load 1 from %ir.src, addrspace 1)
@@ -115,9 +115,9 @@ define void @test_memmove_p1_p3_i64(i8 addrspace(1)* %dst, i8 addrspace(3)* %src
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64)
   ; CHECK:   G_MEMMOVE [[MV]](p1), [[COPY2]](p3), [[TRUNC]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3)
@@ -133,9 +133,9 @@ define void @test_memmove_p1_p3_i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
   ; CHECK:   G_MEMMOVE [[MV]](p1), [[COPY2]](p3), [[C]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -150,9 +150,9 @@ define void @test_memmove_p1_p3_i16(i8 addrspace(1)* %dst, i8 addrspace(3)* %src
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(p3) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256
   ; CHECK:   [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[C]](s16)
   ; CHECK:   G_MEMMOVE [[MV]](p1), [[COPY2]](p3), [[ZEXT]](s32), 0 :: (store 1 into %ir.dst, addrspace 1), (load 1 from %ir.src, addrspace 3)
@@ -168,10 +168,10 @@ define void @test_memset_p1_i64(i8 addrspace(1)* %dst, i8 %val) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256
   ; CHECK:   G_MEMSET [[MV]](p1), [[TRUNC]](s8), [[C]](s64), 0 :: (store 1 into %ir.dst, addrspace 1)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
@@ -186,10 +186,10 @@ define void @test_memset_p1_i32(i8 addrspace(1)* %dst, i8 %val) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 256
   ; CHECK:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[C]](s32)
   ; CHECK:   G_MEMSET [[MV]](p1), [[TRUNC]](s8), [[ZEXT]](s64), 0 :: (store 1 into %ir.dst, addrspace 1)
@@ -205,10 +205,10 @@ define void @test_memset_p1_i16(i8 addrspace(1)* %dst, i8 %val) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 256
   ; CHECK:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[C]](s16)
   ; CHECK:   G_MEMSET [[MV]](p1), [[TRUNC]](s8), [[ZEXT]](s64), 0 :: (store 1 into %ir.dst, addrspace 1)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
index 4f56e1ae598d..3c8ca3d35749 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
@@ -7,11 +7,11 @@ define i8* @ptrmask_flat_i64(i8* %ptr, i64 %mask) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[MV1]](s64)
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -28,9 +28,9 @@ define i8* @ptrmask_flat_i32(i8* %ptr, i32 %mask) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[COPY2]](s32)
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -47,10 +47,10 @@ define i8* @ptrmask_flat_i16(i8* %ptr, i16 %mask) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[TRUNC]](s16)
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -67,10 +67,10 @@ define i8* @ptrmask_flat_i1(i8* %ptr, i1 %mask) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY2]](s32)
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p0) = G_PTRMASK [[MV]], [[TRUNC]](s1)
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTRMASK]](p0)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -88,8 +88,8 @@ define i8 addrspace(3)* @ptrmask_local_i64(i8 addrspace(3)* %ptr, i64 %mask) {
   ; CHECK:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; CHECK:   [[COPY3:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[PTRMASK:%[0-9]+]]:_(p3) = G_PTRMASK [[COPY]], [[MV]](s64)
   ; CHECK:   $vgpr0 = COPY [[PTRMASK]](p3)
   ; CHECK:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
index 18e10a1fc390..ada7cf54f544 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
@@ -42,11 +42,11 @@ define i64 @uaddsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[UADDSAT:%[0-9]+]]:_(s64) = G_UADDSAT [[MV]], [[MV1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UADDSAT]](s64)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -64,11 +64,11 @@ define <2 x i32> @uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[UADDSAT:%[0-9]+]]:_(<2 x s32>) = G_UADDSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UADDSAT]](<2 x s32>)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -121,11 +121,11 @@ define i64 @saddsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[SADDSAT:%[0-9]+]]:_(s64) = G_SADDSAT [[MV]], [[MV1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SADDSAT]](s64)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -143,11 +143,11 @@ define <2 x i32> @saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[SADDSAT:%[0-9]+]]:_(<2 x s32>) = G_SADDSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SADDSAT]](<2 x s32>)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -200,11 +200,11 @@ define i64 @usubsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[USUBSAT:%[0-9]+]]:_(s64) = G_USUBSAT [[MV]], [[MV1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USUBSAT]](s64)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -222,11 +222,11 @@ define <2 x i32> @usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[USUBSAT:%[0-9]+]]:_(<2 x s32>) = G_USUBSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USUBSAT]](<2 x s32>)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -279,11 +279,11 @@ define i64 @ssubsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[SSUBSAT:%[0-9]+]]:_(s64) = G_SSUBSAT [[MV]], [[MV1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSUBSAT]](s64)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -301,11 +301,11 @@ define <2 x i32> @ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[SSUBSAT:%[0-9]+]]:_(<2 x s32>) = G_SSUBSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]]
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSUBSAT]](<2 x s32>)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -358,11 +358,11 @@ define i64 @ushlsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[USHLSAT:%[0-9]+]]:_(s64) = G_USHLSAT [[MV]], [[MV1]](s64)
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USHLSAT]](s64)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -380,11 +380,11 @@ define <2 x i32> @ushlsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[USHLSAT:%[0-9]+]]:_(<2 x s32>) = G_USHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s32>)
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USHLSAT]](<2 x s32>)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -437,11 +437,11 @@ define i64 @sshlsat_i64(i64 %lhs, i64 %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[SSHLSAT:%[0-9]+]]:_(s64) = G_SSHLSAT [[MV]], [[MV1]](s64)
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSHLSAT]](s64)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)
@@ -459,11 +459,11 @@ define <2 x i32> @sshlsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ; CHECK:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31
   ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
   ; CHECK:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK:   [[SSHLSAT:%[0-9]+]]:_(<2 x s32>) = G_SSHLSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s32>)
   ; CHECK:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSHLSAT]](<2 x s32>)
   ; CHECK:   $vgpr0 = COPY [[UV]](s32)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
index 376323e3f688..00a4b468e7d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.atomic.dim.a16.ll
@@ -14,10 +14,10 @@ define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -32,10 +32,10 @@ define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.swap.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -57,10 +57,10 @@ define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -75,10 +75,10 @@ define amdgpu_ps float @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -100,10 +100,10 @@ define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -118,10 +118,10 @@ define amdgpu_ps float @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.sub.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -143,10 +143,10 @@ define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -161,10 +161,10 @@ define amdgpu_ps float @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -187,10 +187,10 @@ define amdgpu_ps float @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -205,10 +205,10 @@ define amdgpu_ps float @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umin.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -230,10 +230,10 @@ define amdgpu_ps float @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -248,10 +248,10 @@ define amdgpu_ps float @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.smax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -273,10 +273,10 @@ define amdgpu_ps float @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -291,10 +291,10 @@ define amdgpu_ps float @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.umax.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -316,10 +316,10 @@ define amdgpu_ps float @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.and.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -334,10 +334,10 @@ define amdgpu_ps float @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.and.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -359,10 +359,10 @@ define amdgpu_ps float @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s) {
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.or.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -377,10 +377,10 @@ define amdgpu_ps float @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s) {
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.or.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -402,10 +402,10 @@ define amdgpu_ps float @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.xor.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -420,10 +420,10 @@ define amdgpu_ps float @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.xor.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -445,10 +445,10 @@ define amdgpu_ps float @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.inc.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -463,10 +463,10 @@ define amdgpu_ps float @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.inc.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -488,10 +488,10 @@ define amdgpu_ps float @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.dec.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -506,10 +506,10 @@ define amdgpu_ps float @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i16 %s)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.dec.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -531,11 +531,11 @@ define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
@@ -551,11 +551,11 @@ define amdgpu_ps float @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.cmpswap.1d), [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
@@ -578,10 +578,10 @@ define amdgpu_ps float @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
@@ -599,10 +599,10 @@ define amdgpu_ps float @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
@@ -627,11 +627,11 @@ define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
@@ -653,11 +653,11 @@ define amdgpu_ps float @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s,
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
@@ -686,11 +686,11 @@ define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
@@ -712,11 +712,11 @@ define amdgpu_ps float @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
@@ -745,10 +745,10 @@ define amdgpu_ps float @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i16
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
@@ -766,10 +766,10 @@ define amdgpu_ps float @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i16
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY11]](s32), [[COPY12]](s32)
@@ -794,11 +794,11 @@ define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i16
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
@@ -820,11 +820,11 @@ define amdgpu_ps float @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i16
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
@@ -853,11 +853,11 @@ define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
@@ -879,11 +879,11 @@ define amdgpu_ps float @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32)
@@ -912,12 +912,12 @@ define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
@@ -939,12 +939,12 @@ define amdgpu_ps float @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data,
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32)
   ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32)
@@ -973,10 +973,10 @@ define amdgpu_ps float @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i16
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX9:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX9:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -991,10 +991,10 @@ define amdgpu_ps float @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i16
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.atomic.add.1d), [[COPY8]](s32), [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (volatile dereferenceable load store 4 on custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -1016,11 +1016,11 @@ define amdgpu_ps float @atomic_cmpswap_2d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
@@ -1039,11 +1039,11 @@ define amdgpu_ps float @atomic_cmpswap_2d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
@@ -1069,12 +1069,12 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
@@ -1097,12 +1097,12 @@ define amdgpu_ps float @atomic_cmpswap_3d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
@@ -1132,13 +1132,13 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)
@@ -1161,13 +1161,13 @@ define amdgpu_ps float @atomic_cmpswap_2darraymsaa(<8 x i32> inreg %rsrc, i32 %c
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX10NSA:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32)
   ; GFX10NSA:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
index cb47c5983891..1c0ad3692b36 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll
@@ -14,8 +14,8 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -38,8 +38,8 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -69,8 +69,8 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -98,8 +98,8 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords)
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -135,9 +135,9 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -171,9 +171,9 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -216,9 +216,9 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -252,9 +252,9 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -297,8 +297,8 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -326,8 +326,8 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -363,9 +363,9 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -399,9 +399,9 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -444,9 +444,9 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -480,9 +480,9 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -525,9 +525,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -563,9 +563,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -611,8 +611,8 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -640,8 +640,8 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -677,9 +677,9 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -713,9 +713,9 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -758,9 +758,9 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -796,9 +796,9 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -844,9 +844,9 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -882,9 +882,9 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -930,9 +930,9 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -966,9 +966,9 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1011,9 +1011,9 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1049,9 +1049,9 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1097,13 +1097,13 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1121,13 +1121,13 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1152,13 +1152,13 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1181,13 +1181,13 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1218,14 +1218,14 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1254,14 +1254,14 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1299,14 +1299,14 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1335,14 +1335,14 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1380,13 +1380,13 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1409,13 +1409,13 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1446,14 +1446,14 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1482,14 +1482,14 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1527,14 +1527,14 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1563,14 +1563,14 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1608,14 +1608,14 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1646,14 +1646,14 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1694,13 +1694,13 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1723,13 +1723,13 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1760,14 +1760,14 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1796,14 +1796,14 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1841,14 +1841,14 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1879,14 +1879,14 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1927,14 +1927,14 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -1965,14 +1965,14 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata,
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2013,14 +2013,14 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2049,14 +2049,14 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2094,14 +2094,14 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2132,14 +2132,14 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2180,8 +2180,8 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2204,8 +2204,8 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2235,8 +2235,8 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2259,8 +2259,8 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2290,8 +2290,8 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2314,8 +2314,8 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2345,8 +2345,8 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2369,8 +2369,8 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2400,8 +2400,8 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2424,8 +2424,8 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2455,8 +2455,8 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2479,8 +2479,8 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2510,8 +2510,8 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2534,8 +2534,8 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16>
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2565,8 +2565,8 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2589,8 +2589,8 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2620,8 +2620,8 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2640,8 +2640,8 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2667,8 +2667,8 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2689,8 +2689,8 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2718,9 +2718,9 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2738,9 +2738,9 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2765,11 +2765,11 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+  ; GFX9:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2787,11 +2787,11 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
+  ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2816,8 +2816,8 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2840,8 +2840,8 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2871,8 +2871,8 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2895,8 +2895,8 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2926,8 +2926,8 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2950,8 +2950,8 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -2981,13 +2981,13 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -3005,13 +3005,13 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -3036,13 +3036,13 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -3060,13 +3060,13 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -3091,13 +3091,13 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX9:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX9:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -3115,13 +3115,13 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>)
   ; GFX10NSA:   [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
@@ -3191,8 +3191,8 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
@@ -3217,8 +3217,8 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
@@ -3253,8 +3253,8 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
@@ -3284,8 +3284,8 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
-  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
@@ -3326,9 +3326,9 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
@@ -3364,9 +3364,9 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
@@ -3414,9 +3414,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX9:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)
@@ -3454,9 +3454,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10NSA:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
index 76291879c7c3..9977aac3ee6f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.d16.ll
@@ -14,9 +14,9 @@ define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s16) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
   ; UNPACKED:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AMDGPU_INTRIN_IMAGE_LOAD]](s16)
@@ -33,9 +33,9 @@ define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s16) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
   ; PACKED:   [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AMDGPU_INTRIN_IMAGE_LOAD]](s16)
@@ -57,9 +57,9 @@ define amdgpu_ps <2 x half> @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; UNPACKED:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
@@ -85,9 +85,9 @@ define amdgpu_ps <2 x half> @image_load_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; PACKED:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>)
@@ -108,9 +108,9 @@ define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 6 from custom "ImageResource", align 8)
   ; UNPACKED:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
@@ -151,9 +151,9 @@ define amdgpu_ps <3 x half> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 6 from custom "ImageResource", align 8)
   ; PACKED:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s16>)
@@ -202,9 +202,9 @@ define amdgpu_ps <4 x half> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 8 from custom "ImageResource")
   ; UNPACKED:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -238,9 +238,9 @@ define amdgpu_ps <4 x half> @image_load_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 8 from custom "ImageResource")
   ; PACKED:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s16>)
@@ -263,9 +263,9 @@ define amdgpu_ps half @image_load_tfe_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t)
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -285,9 +285,9 @@ define amdgpu_ps half @image_load_tfe_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t)
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -315,9 +315,9 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16(<8 x i32> inreg %rsrc, i32 %s,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -345,9 +345,9 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16(<8 x i32> inreg %rsrc, i32 %s,
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -375,9 +375,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 6 from custom "ImageResource", align 8)
@@ -420,9 +420,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16(<8 x i32> inreg %rsrc, i32 %s,
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 6 from custom "ImageResource", align 8)
@@ -478,9 +478,9 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16(<8 x i32> inreg %rsrc, i32 %s,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 8 from custom "ImageResource")
@@ -516,9 +516,9 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16(<8 x i32> inreg %rsrc, i32 %s,
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 8 from custom "ImageResource")
@@ -585,9 +585,9 @@ define amdgpu_ps <2 x half> @image_load_v2f16_dmask_1000(<8 x i32> inreg %rsrc,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
   ; UNPACKED:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
@@ -611,9 +611,9 @@ define amdgpu_ps <2 x half> @image_load_v2f16_dmask_1000(<8 x i32> inreg %rsrc,
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
   ; PACKED:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s16>)
@@ -671,9 +671,9 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; UNPACKED:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
@@ -712,9 +712,9 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1100(<8 x i32> inreg %rsrc,
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; PACKED:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -762,9 +762,9 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -800,9 +800,9 @@ define amdgpu_ps <3 x half> @image_load_v3f16_dmask_1000(<8 x i32> inreg %rsrc,
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
   ; PACKED:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -941,9 +941,9 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1110(<8 x i32> inreg %rsrc,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 6 from custom "ImageResource", align 8)
   ; UNPACKED:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
@@ -976,9 +976,9 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1110(<8 x i32> inreg %rsrc,
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 6 from custom "ImageResource", align 8)
   ; PACKED:   [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s16>)
@@ -1001,9 +1001,9 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1100(<8 x i32> inreg %rsrc,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; UNPACKED:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
@@ -1034,9 +1034,9 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1100(<8 x i32> inreg %rsrc,
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; PACKED:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -1059,9 +1059,9 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1000(<8 x i32> inreg %rsrc,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
   ; UNPACKED:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
@@ -1088,9 +1088,9 @@ define amdgpu_ps <4 x half> @image_load_v4f16_dmask_1000(<8 x i32> inreg %rsrc,
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s16>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
   ; PACKED:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
@@ -1154,9 +1154,9 @@ define amdgpu_ps half @image_load_tfe_f16_dmask_0000(<8 x i32> inreg %rsrc, i32
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1176,9 +1176,9 @@ define amdgpu_ps half @image_load_tfe_f16_dmask_0000(<8 x i32> inreg %rsrc, i32
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1206,9 +1206,9 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_1000(<8 x i32> inreg %rs
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1235,9 +1235,9 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_1000(<8 x i32> inreg %rs
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1265,9 +1265,9 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_0000(<8 x i32> inreg %rs
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1294,9 +1294,9 @@ define amdgpu_ps <2 x half> @image_load_tfe_v2f16_dmask_0000(<8 x i32> inreg %rs
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1324,9 +1324,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -1367,9 +1367,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1100(<8 x i32> inreg %rs
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -1424,9 +1424,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1465,9 +1465,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_1000(<8 x i32> inreg %rs
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1522,9 +1522,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1563,9 +1563,9 @@ define amdgpu_ps <3 x half> @image_load_tfe_v3f16_dmask_0000(<8 x i32> inreg %rs
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1620,9 +1620,9 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1110(<8 x i32> inreg %rs
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 6 from custom "ImageResource", align 8)
@@ -1657,9 +1657,9 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1110(<8 x i32> inreg %rs
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 6 from custom "ImageResource", align 8)
@@ -1689,9 +1689,9 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1100(<8 x i32> inreg %rs
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -1724,9 +1724,9 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1100(<8 x i32> inreg %rs
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -1756,9 +1756,9 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1000(<8 x i32> inreg %rs
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1788,9 +1788,9 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_1000(<8 x i32> inreg %rs
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1820,9 +1820,9 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_0000(<8 x i32> inreg %rs
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")
@@ -1852,9 +1852,9 @@ define amdgpu_ps <4 x half> @image_load_tfe_v4f16_dmask_0000(<8 x i32> inreg %rs
   ; PACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; PACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; PACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; PACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; PACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; PACKED:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; PACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; PACKED:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 2 from custom "ImageResource")

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
index 96261ae3236a..c68fa9ca2ca9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2d.ll
@@ -13,9 +13,9 @@ define amdgpu_ps float @image_load_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; GCN:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
@@ -36,9 +36,9 @@ define amdgpu_ps <2 x float> @image_load_v2f32(<8 x i32> inreg %rsrc, i32 %s, i3
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 8 from custom "ImageResource")
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
@@ -61,9 +61,9 @@ define amdgpu_ps <3 x float> @image_load_v3f32(<8 x i32> inreg %rsrc, i32 %s, i3
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 12 from custom "ImageResource", align 16)
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
@@ -87,9 +87,9 @@ define amdgpu_ps <4 x float> @image_load_v4f32(<8 x i32> inreg %rsrc, i32 %s, i3
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -114,9 +114,9 @@ define amdgpu_ps float @image_load_tfe_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -143,9 +143,9 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32(<8 x i32> inreg %rsrc, i32 %s
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 8 from custom "ImageResource")
@@ -173,9 +173,9 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32(<8 x i32> inreg %rsrc, i32 %s
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 12 from custom "ImageResource", align 16)
@@ -204,9 +204,9 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32(<8 x i32> inreg %rsrc, i32 %s
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
@@ -257,9 +257,9 @@ define amdgpu_ps <2 x float> @image_load_v2f32_dmask_1000(<8 x i32> inreg %rsrc,
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; GCN:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -305,9 +305,9 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1100(<8 x i32> inreg %rsrc,
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 8 from custom "ImageResource")
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
@@ -332,9 +332,9 @@ define amdgpu_ps <3 x float> @image_load_v3f32_dmask_1000(<8 x i32> inreg %rsrc,
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; GCN:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -382,9 +382,9 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1110(<8 x i32> inreg %rsrc,
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 12 from custom "ImageResource", align 16)
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<3 x s32>)
@@ -410,9 +410,9 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1100(<8 x i32> inreg %rsrc,
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 8 from custom "ImageResource")
   ; GCN:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)
@@ -438,9 +438,9 @@ define amdgpu_ps <4 x float> @image_load_v4f32_dmask_1000(<8 x i32> inreg %rsrc,
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; GCN:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -490,9 +490,9 @@ define amdgpu_ps float @image_load_tfe_f32_dmask_0000(<8 x i32> inreg %rsrc, i32
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -519,9 +519,9 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_1000(<8 x i32> inreg %r
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -550,9 +550,9 @@ define amdgpu_ps <2 x float> @image_load_tfe_v2f32_dmask_0000(<8 x i32> inreg %r
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -581,9 +581,9 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1100(<8 x i32> inreg %r
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 8 from custom "ImageResource")
@@ -613,9 +613,9 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_1000(<8 x i32> inreg %r
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -645,9 +645,9 @@ define amdgpu_ps <3 x float> @image_load_tfe_v3f32_dmask_0000(<8 x i32> inreg %r
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -677,9 +677,9 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1110(<8 x i32> inreg %r
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 12 from custom "ImageResource", align 16)
@@ -710,9 +710,9 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1100(<8 x i32> inreg %r
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 8 from custom "ImageResource")
@@ -743,9 +743,9 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_1000(<8 x i32> inreg %r
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -776,9 +776,9 @@ define amdgpu_ps <4 x float> @image_load_tfe_v4f32_dmask_0000(<8 x i32> inreg %r
   ; GCN:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GCN:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GCN:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GCN:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GCN:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GCN:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GCN:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GCN:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
index ea3b5797025a..0cc6fc36e7e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll
@@ -14,11 +14,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
   ; GFX6:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX6:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX6:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX6:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX6:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX6:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX6:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX6:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX6:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX6:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX6:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
   ; GFX6:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
@@ -38,11 +38,11 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10NSA:   $vgpr0 = COPY [[UV]](s32)
@@ -66,14 +66,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad
   ; GFX6:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX6:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX6:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX6:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX6:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX6:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX6:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
   ; GFX6:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX6:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX6:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX6:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX6:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX6:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
   ; GFX6:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32)
   ; GFX6:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[BUILD_VECTOR1]](<4 x s32>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
   ; GFX6:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
@@ -94,14 +94,14 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
+  ; GFX10NSA:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10NSA:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10NSA:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>)
   ; GFX10NSA:   G_STORE [[UV4]](s32), [[MV]](p1) :: (store 4 into %ir.out, addrspace 1)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll
index e3d384a8b8c8..60baab0650ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll
@@ -14,10 +14,10 @@ define amdgpu_ps float @image_load_3d_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX6:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX6:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX6:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX6:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX6:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX6:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX6:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX6:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX6:   [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32)
   ; GFX6:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; GFX6:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
@@ -33,10 +33,10 @@ define amdgpu_ps float @image_load_3d_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; GFX10NSA:   $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32)
   ; GFX10NSA:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -56,10 +56,10 @@ define amdgpu_ps float @image_load_3d_tfe_f32(<8 x i32> inreg %rsrc, i32 %s, i32
   ; GFX6:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX6:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX6:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX6:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX6:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX6:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX6:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX6:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX6:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX6:   [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32)
   ; GFX6:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
@@ -78,10 +78,10 @@ define amdgpu_ps float @image_load_3d_tfe_f32(<8 x i32> inreg %rsrc, i32 %s, i32
   ; GFX10NSA:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10NSA:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10NSA:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10NSA:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10NSA:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10NSA:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10NSA:   [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; GFX10NSA:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 0 :: (dereferenceable load 4 from custom "ImageResource")
   ; GFX10NSA:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
index 0e62bb25f6ba..209bd73220fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.a16.ll
@@ -14,14 +14,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "ImageResource")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -40,14 +40,14 @@ define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "ImageResource")
   ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10:   $vgpr0 = COPY [[UV]](s32)
@@ -72,14 +72,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -101,14 +101,14 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -136,15 +136,15 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -170,15 +170,15 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -210,15 +210,15 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -244,15 +244,15 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -284,14 +284,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -313,14 +313,14 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -348,15 +348,15 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -382,15 +382,15 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -422,14 +422,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -453,14 +453,14 @@ define amdgpu_ps <4 x float> @sample_c_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -490,15 +490,15 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -522,15 +522,15 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -560,14 +560,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -589,14 +589,14 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -624,15 +624,15 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -658,15 +658,15 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -698,15 +698,15 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -730,15 +730,15 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -768,16 +768,16 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -804,16 +804,16 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -845,14 +845,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -876,14 +876,14 @@ define amdgpu_ps <4 x float> @sample_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -913,15 +913,15 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -945,15 +945,15 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -983,15 +983,15 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1016,15 +1016,15 @@ define amdgpu_ps <4 x float> @sample_c_b_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1054,16 +1054,16 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1088,16 +1088,16 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1127,15 +1127,15 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1159,15 +1159,15 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1197,16 +1197,16 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1233,16 +1233,16 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1274,16 +1274,16 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1308,16 +1308,16 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1347,17 +1347,17 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1385,17 +1385,17 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1428,15 +1428,15 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
@@ -1463,15 +1463,15 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
@@ -1503,18 +1503,18 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
@@ -1543,18 +1543,18 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
@@ -1588,10 +1588,12 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -1601,8 +1603,6 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; GFX9:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
@@ -1638,10 +1638,12 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -1651,8 +1653,6 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
@@ -1693,16 +1693,16 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -1730,16 +1730,16 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -1772,10 +1772,12 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -1783,8 +1785,6 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1814,10 +1814,12 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -1825,8 +1827,6 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -1861,16 +1861,16 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
@@ -1898,16 +1898,16 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
@@ -1940,10 +1940,12 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -1951,8 +1953,6 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32)
@@ -1984,10 +1984,12 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -1995,8 +1997,6 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32)
@@ -2033,17 +2033,17 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -2072,17 +2072,17 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -2116,10 +2116,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -2128,8 +2130,6 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -2162,10 +2162,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -2174,8 +2176,6 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -2213,15 +2213,15 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
@@ -2248,15 +2248,15 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
@@ -2288,18 +2288,18 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
@@ -2328,18 +2328,18 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
@@ -2373,16 +2373,16 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -2410,16 +2410,16 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -2452,10 +2452,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -2463,8 +2465,6 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -2494,10 +2494,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -2505,8 +2507,6 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -2541,16 +2541,16 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
@@ -2578,16 +2578,16 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
@@ -2620,10 +2620,12 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -2631,8 +2633,6 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32)
@@ -2664,10 +2664,12 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -2675,8 +2677,6 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32)
@@ -2713,17 +2713,17 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -2752,17 +2752,17 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -2796,10 +2796,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -2808,8 +2810,6 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -2842,10 +2842,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -2854,8 +2856,6 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -2893,14 +2893,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -2922,14 +2922,14 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -2957,15 +2957,15 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -2991,15 +2991,15 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
@@ -3031,15 +3031,15 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -3063,15 +3063,15 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -3101,16 +3101,16 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -3137,16 +3137,16 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -3178,14 +3178,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "ImageResource")
   ; GFX9:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX9:   $vgpr0 = COPY [[UV]](s32)
@@ -3204,14 +3204,14 @@ define amdgpu_ps <4 x float> @sample_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY12]](s32)
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.lz.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 3 :: (dereferenceable load 16 from custom "ImageResource")
   ; GFX10:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>)
   ; GFX10:   $vgpr0 = COPY [[UV]](s32)
@@ -3236,14 +3236,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -3265,14 +3265,14 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32)
@@ -3300,14 +3300,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -3331,14 +3331,14 @@ define amdgpu_ps <4 x float> @sample_c_lz_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -3368,15 +3368,15 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX9:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -3400,15 +3400,15 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -3438,10 +3438,12 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -3451,8 +3453,6 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; GFX9:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX9:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -3482,10 +3482,12 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -3495,8 +3497,6 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -3531,10 +3531,12 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX9:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -3544,8 +3546,6 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX9:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX9:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; GFX9:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX9:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX9:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -3577,10 +3577,12 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -3590,8 +3592,6 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
index 600df521e23d..12ae4a3166d1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll
@@ -13,15 +13,15 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
@@ -51,18 +51,18 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
@@ -93,10 +93,12 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -106,8 +108,6 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY21]](s32), [[COPY22]](s32)
@@ -143,16 +143,16 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -183,10 +183,12 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -194,8 +196,6 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -227,16 +227,16 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
@@ -266,10 +266,12 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -277,8 +279,6 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32)
@@ -309,17 +309,17 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -350,10 +350,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -362,8 +364,6 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -395,15 +395,15 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[DEF]](s32)
@@ -433,18 +433,18 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32)
@@ -475,16 +475,16 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -515,10 +515,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -526,8 +528,6 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -559,16 +559,16 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32)
@@ -598,10 +598,12 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -609,8 +611,6 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY19]](s32), [[COPY20]](s32)
@@ -641,17 +641,17 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10:   [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr4
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -682,10 +682,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -694,8 +696,6 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32)
   ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -727,10 +727,12 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -740,8 +742,6 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)
@@ -770,10 +770,12 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $sgpr10
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr11
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $sgpr12
   ; GFX10:   [[COPY11:%[0-9]+]]:_(s32) = COPY $sgpr13
+  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr2
@@ -783,8 +785,6 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
   ; GFX10:   [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr6
   ; GFX10:   [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr7
   ; GFX10:   [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr8
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
-  ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY12]](s32)
   ; GFX10:   [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY13]](s32)
   ; GFX10:   [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32)

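Aside: the recurring pattern in the hunks above and below is purely a change in where the argument-packing instruction is emitted. A minimal sketch of what moves, using invented virtual register numbers (not taken from any test in this patch) — the G_BUILD_VECTOR that assembles an inreg vector argument now appears immediately after the COPYs it consumes, instead of after the copies for the remaining arguments:

  Old placement:
    %8:_(s32) = COPY $sgpr10
    %9:_(s32) = COPY $sgpr11
    %10:_(s32) = COPY $sgpr12
    %11:_(s32) = COPY $sgpr13
    %12:_(s32) = COPY $vgpr0
    %13:_(<4 x s32>) = G_BUILD_VECTOR %8(s32), %9(s32), %10(s32), %11(s32)

  New placement:
    %8:_(s32) = COPY $sgpr10
    %9:_(s32) = COPY $sgpr11
    %10:_(s32) = COPY $sgpr12
    %11:_(s32) = COPY $sgpr13
    %12:_(<4 x s32>) = G_BUILD_VECTOR %8(s32), %9(s32), %10(s32), %11(s32)
    %13:_(s32) = COPY $vgpr0

The operands and types are identical in both forms; every hunk is a paired -/+ move of the same instruction, so the checked values themselves do not change.
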
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
index 740b5438f57d..3700c238905a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll
@@ -16,11 +16,11 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; UNPACKED:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; UNPACKED:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "ImageResource")
   ; UNPACKED:   S_ENDPGM 0
@@ -35,11 +35,11 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
   ; GFX81:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX81:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX81:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX81:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX81:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX81:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX81:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX81:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
-  ; GFX81:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX81:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX81:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "ImageResource")
   ; GFX81:   S_ENDPGM 0
@@ -54,11 +54,11 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX9:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "ImageResource")
   ; GFX9:   S_ENDPGM 0
@@ -73,11 +73,11 @@ define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ha
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10:   [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32)
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX10:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "ImageResource")
   ; GFX10:   S_ENDPGM 0
@@ -97,10 +97,10 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; UNPACKED:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
   ; UNPACKED:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
@@ -121,10 +121,10 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX81:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX81:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX81:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX81:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX81:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX81:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX81:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-  ; GFX81:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX81:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX81:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
   ; GFX81:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
@@ -142,10 +142,10 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "ImageResource")
   ; GFX9:   S_ENDPGM 0
@@ -160,10 +160,10 @@ define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX10:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "ImageResource")
   ; GFX10:   S_ENDPGM 0
@@ -183,11 +183,11 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; UNPACKED:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; UNPACKED:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; UNPACKED:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
   ; UNPACKED:   [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
@@ -213,11 +213,11 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX81:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX81:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX81:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX81:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX81:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX81:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX81:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX81:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; GFX81:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX81:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; GFX81:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
   ; GFX81:   [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
@@ -257,11 +257,11 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
   ; GFX9:   [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
@@ -291,11 +291,11 @@ define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX10:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
   ; GFX10:   [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>)
   ; GFX10:   [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>)
@@ -330,11 +330,11 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; UNPACKED:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; UNPACKED:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; UNPACKED:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; UNPACKED:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; UNPACKED:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; UNPACKED:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; UNPACKED:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; UNPACKED:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; UNPACKED:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>)
   ; UNPACKED:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
@@ -359,11 +359,11 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX81:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX81:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX81:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX81:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX81:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX81:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX81:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX81:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; GFX81:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX81:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
   ; GFX81:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX81:   [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[CONCAT_VECTORS]](<4 x s16>)
@@ -383,11 +383,11 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX9:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX9:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX9:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX9:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX9:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX9:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; GFX9:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX9:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
   ; GFX9:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX9:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "ImageResource")
@@ -403,11 +403,11 @@ define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
   ; GFX10:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7
   ; GFX10:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8
   ; GFX10:   [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9
+  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; GFX10:   [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; GFX10:   [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
   ; GFX10:   [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
-  ; GFX10:   [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>)
   ; GFX10:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32)
   ; GFX10:   G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "ImageResource")

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
index 3de0fbae9a31..f2bd8745d1dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll
@@ -11,9 +11,9 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_vof
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -31,9 +31,9 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc__vg
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -48,14 +48,14 @@ define amdgpu_ps <2 x float> @raw_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc__vg
   ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN]].sub0
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN]].sub1
@@ -73,14 +73,14 @@ define amdgpu_ps void @raw_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__vgp
   ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[BUFFER_ATOMIC_ADD_X2_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_OFFEN_RTN [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -98,9 +98,9 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -147,9 +147,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -192,9 +192,9 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_vof
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -214,9 +214,9 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_vof
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[BUFFER_ATOMIC_ADD_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_OFFEN_RTN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0

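The post-selection tests below show the same motion at the MachineInstr level: the sgpr_128/vreg_128 REG_SEQUENCE that assembles the buffer resource descriptor now directly follows the four COPYs it reads, rather than trailing the voffset/soffset copies. A sketch with invented register numbers, mirroring the + lines in these hunks:

    %1:sreg_32 = COPY $sgpr2
    %2:sreg_32 = COPY $sgpr3
    %3:sreg_32 = COPY $sgpr4
    %4:sreg_32 = COPY $sgpr5
    %5:sgpr_128 = REG_SEQUENCE %1, %subreg.sub0, %2, %subreg.sub1, %3, %subreg.sub2, %4, %subreg.sub3
    %6:vgpr_32 = COPY $vgpr1
    %7:sreg_32 = COPY $sgpr6

As above, the REG_SEQUENCE operands and register classes are unchanged; only its insertion point differs.
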
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
index baa3a7167073..a25855ed8c22 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll
@@ -12,9 +12,9 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0
@@ -36,9 +36,9 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__vgpr_val__vgpr_cmp__
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0
@@ -59,9 +59,9 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -112,9 +112,9 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -161,9 +161,9 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN]].sub0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
index d3d4569243df..fd57f013db33 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll
@@ -12,9 +12,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
@@ -25,9 +25,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -43,9 +43,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset
@@ -56,9 +56,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
@@ -75,8 +75,8 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset
@@ -87,8 +87,8 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
@@ -105,8 +105,8 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_v
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
@@ -117,8 +117,8 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_v
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)
@@ -136,9 +136,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp
   ; GFX908:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX908:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; GFX908:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX908:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -177,9 +177,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp
   ; GFX90A:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX90A:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; GFX90A:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX90A:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -224,8 +224,8 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v
   ; GFX908:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX908:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; GFX908:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX908:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -263,8 +263,8 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v
   ; GFX90A:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX90A:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; GFX90A:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX90A:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -306,9 +306,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095
@@ -319,9 +319,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %voffset = add i32 %voffset.base, 4095
@@ -339,9 +339,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
@@ -352,9 +352,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgp
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
@@ -370,9 +370,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__v
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset
@@ -383,9 +383,9 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__v
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -401,8 +401,8 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX908:   BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
@@ -413,8 +413,8 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX90A:   [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
index 1343cb3cd791..ad616e43e400 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
@@ -11,9 +11,9 @@ define amdgpu_ps half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -24,9 +24,9 @@ define amdgpu_ps half @raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -42,9 +42,9 @@ define amdgpu_ps <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffs
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -55,9 +55,9 @@ define amdgpu_ps <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffs
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1
@@ -90,9 +90,9 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1
@@ -106,9 +106,9 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
@@ -147,9 +147,9 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr
   ; PACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; PACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; PACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -187,9 +187,9 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr
   ; UNPACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -231,9 +231,9 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1
@@ -247,9 +247,9 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
index 6285932fc751..2a91b3689d9c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll
@@ -10,9 +10,9 @@ define amdgpu_ps float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgp
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -28,9 +28,9 @@ define amdgpu_ps <2 x float> @raw_buffer_load_format_v2f32__sgpr_rsrc__vgpr_voff
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub1
@@ -49,9 +49,9 @@ define amdgpu_ps <3 x float> @raw_buffer_load_format_v3f32__sgpr_rsrc__vgpr_voff
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1
@@ -72,9 +72,9 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
@@ -99,9 +99,9 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -143,9 +143,9 @@ define amdgpu_ps <4 x float> @raw_buffer_load_format_v4f32__sgpr_rsrc__vgpr_voff
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
index b5d0b1725be0..a9e2e445d35f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll
@@ -11,9 +11,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -30,9 +30,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__sgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -51,9 +51,9 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -93,9 +93,9 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -137,9 +137,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -156,9 +156,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -175,9 +175,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -194,9 +194,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -213,9 +213,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -232,9 +232,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -251,9 +251,9 @@ define amdgpu_ps <2 x float> @raw_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sg
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1
@@ -272,9 +272,9 @@ define amdgpu_ps <3 x float> @raw_buffer_load_v3f32__sgpr_rsrc__vgpr_voffset__sg
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1
@@ -295,9 +295,9 @@ define amdgpu_ps <4 x float> @raw_buffer_load_v4f32__sgpr_rsrc__vgpr_voffset__sg
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
@@ -320,9 +320,9 @@ define amdgpu_ps half @raw_buffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -338,9 +338,9 @@ define amdgpu_ps <2 x half> @raw_buffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sgp
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -362,9 +362,9 @@ define amdgpu_ps <4 x half> @raw_buffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sgp
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1
@@ -383,9 +383,9 @@ define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -403,9 +403,9 @@ define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
   ; CHECK:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_OFFEN]], 0, 8, implicit $exec
   ; CHECK:   $vgpr0 = COPY [[V_BFE_I32_e64_]]
@@ -426,9 +426,9 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -468,9 +468,9 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -510,8 +510,8 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vdpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -527,8 +527,8 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -544,8 +544,8 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4096, align 1, addrspace 4)
@@ -563,9 +563,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 16, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -582,9 +582,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -601,9 +601,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %10:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
@@ -623,8 +623,8 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -641,8 +641,8 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -659,9 +659,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
@@ -680,9 +680,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
@@ -701,9 +701,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
   ; CHECK:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
@@ -724,9 +724,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
   ; CHECK:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -769,9 +769,9 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %13:vgpr_32, dead %35:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
index b084e93ab65d..31724a1b5cd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
@@ -10,10 +10,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16
@@ -23,10 +23,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -41,9 +41,9 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16
@@ -53,9 +53,9 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
@@ -70,10 +70,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
@@ -87,10 +87,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -105,11 +105,11 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
@@ -125,12 +125,12 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -147,11 +147,11 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
   ; UNPACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
@@ -190,12 +190,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+  ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; PACKED:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; PACKED:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -232,9 +232,9 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
   ; UNPACKED:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
@@ -249,9 +249,9 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
   ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
@@ -267,9 +267,9 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; UNPACKED:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
@@ -284,9 +284,9 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
@@ -302,10 +302,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
@@ -319,10 +319,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 16
@@ -338,10 +338,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
@@ -355,10 +355,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
@@ -374,10 +374,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   %11:vgpr_32, dead %24:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
@@ -394,10 +394,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; PACKED:   %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
@@ -419,11 +419,11 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
   ; UNPACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   %13:vgpr_32, dead %49:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
@@ -465,12 +465,12 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+  ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; PACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; PACKED:   %13:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
index 762708fbac94..574c4efc5bca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
@@ -10,10 +10,10 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -28,9 +28,9 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
@@ -45,12 +45,12 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -65,13 +65,13 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
   ; CHECK:   BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -86,14 +86,14 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -109,14 +109,14 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -153,11 +153,11 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
   ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -173,11 +173,11 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
+  ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -193,12 +193,12 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 16
@@ -214,12 +214,12 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
@@ -235,12 +235,12 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %13:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
@@ -262,14 +262,14 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr8
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %15:vgpr_32, dead %35:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
index c6160e342036..43a27d0c33b0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
@@ -11,10 +11,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -30,10 +30,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr8
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
@@ -52,10 +52,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -94,10 +94,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
   ; CHECK: bb.2:
   ; CHECK:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
@@ -126,10 +126,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -169,10 +169,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1)
@@ -187,10 +187,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2)
@@ -205,10 +205,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3)
@@ -223,10 +223,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4)
@@ -241,10 +241,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6)
@@ -259,10 +259,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5)
@@ -277,10 +277,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7)
@@ -295,12 +295,12 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -315,13 +315,13 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
   ; CHECK:   BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -336,14 +336,14 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -358,10 +358,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %val.trunc = trunc i32 %val to i8
@@ -377,10 +377,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %val.trunc = trunc i32 %val to i16
@@ -396,10 +396,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -414,10 +414,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -432,12 +432,12 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -453,12 +453,12 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -495,9 +495,9 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
@@ -512,9 +512,9 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
@@ -531,10 +531,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 16
@@ -550,10 +550,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
@@ -569,10 +569,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
@@ -591,9 +591,9 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -609,9 +609,9 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -627,10 +627,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 16
@@ -646,10 +646,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
@@ -665,10 +665,10 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
@@ -689,10 +689,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   %13:vgpr_32, dead %33:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
@@ -735,9 +735,9 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1

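As context for the hunks above, a minimal reduced test for the intrinsic they exercise (a sketch, not part of this patch; the declaration is inferred from the call sites in the diff) would look like:

declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)

; %rsrc and %soffset are inreg, so they arrive in SGPRs ($sgpr2-$sgpr5 and
; $sgpr6 in the checks above); %val and %voffset arrive in VGPRs. The final
; immediate operand carries the cache-policy bits varied by these tests.
define amdgpu_ps void @store_f32_sketch(<4 x i32> inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) {
  call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  ret void
}
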
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
index e0effb3c9a16..1c4775109ee0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
@@ -10,9 +10,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -23,9 +23,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -41,9 +41,9 @@ define amdgpu_ps <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1
@@ -65,9 +65,9 @@ define amdgpu_ps <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sg
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -89,9 +89,9 @@ define amdgpu_ps <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1
@@ -123,9 +123,9 @@ define amdgpu_ps <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sg
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1
@@ -145,9 +145,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs
   ; UNPACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -185,9 +185,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs
   ; PACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; PACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; PACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -229,9 +229,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -242,9 +242,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -260,9 +260,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -273,9 +273,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -291,9 +291,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -304,9 +304,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -322,9 +322,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]]
   ; UNPACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -335,9 +335,9 @@ define amdgpu_ps half @raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]]
   ; PACKED:   SI_RETURN_TO_EPILOG implicit $vgpr0

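Similarly, a reduced form of the f16 tbuffer load tested above (again a sketch inferred from the call sites, not part of the patch):

declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32 immarg, i32 immarg)

define amdgpu_ps half @tbuffer_load_f16_sketch(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  ; 78 is the format immediate used throughout these tests; the final
  ; immediate is the cache-policy bitfield the glc/slc variants toggle.
  %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
  ret half %val
}
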
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
index 9237be3f843e..7472e3abca87 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll
@@ -9,9 +9,9 @@ define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -27,9 +27,9 @@ define amdgpu_ps <2 x float> @raw_tbuffer_load_v2f32__sgpr_rsrc__vgpr_voffset__s
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1
@@ -48,9 +48,9 @@ define amdgpu_ps <3 x float> @raw_tbuffer_load_v3f32__sgpr_rsrc__vgpr_voffset__s
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1
@@ -71,9 +71,9 @@ define amdgpu_ps <4 x float> @raw_tbuffer_load_v4f32__sgpr_rsrc__vgpr_voffset__s
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1
@@ -97,9 +97,9 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -141,9 +141,9 @@ define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -159,9 +159,9 @@ define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -177,9 +177,9 @@ define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -195,9 +195,9 @@ define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]]
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
index 1487ec3e0b28..7a181e849a60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll
@@ -11,9 +11,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset
@@ -24,9 +24,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -42,9 +42,9 @@ define amdgpu_ps void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY]], implicit $exec
@@ -59,9 +59,9 @@ define amdgpu_ps void @raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -84,9 +84,9 @@ define amdgpu_ps void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec
@@ -100,14 +100,14 @@ define amdgpu_ps void @raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; PACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
   ; PACKED:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; PACKED:   TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -125,9 +125,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; UNPACKED:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -161,9 +161,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; PACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; PACKED:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -203,9 +203,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; UNPACKED:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -242,9 +242,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; PACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; PACKED:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -287,9 +287,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; UNPACKED:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -327,9 +327,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; PACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; PACKED:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -371,9 +371,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc
@@ -384,9 +384,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1)
@@ -402,9 +402,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc
@@ -415,9 +415,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2)
@@ -433,9 +433,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc
@@ -446,9 +446,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3)
@@ -464,9 +464,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc
@@ -477,9 +477,9 @@ define amdgpu_ps void @raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
index ab83fb3387a5..589351e55ed5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll
@@ -11,9 +11,9 @@ define amdgpu_ps void @raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
   ; PACKED-LABEL: name: raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset
@@ -24,9 +24,9 @@ define amdgpu_ps void @raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; PACKED:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0)
@@ -44,9 +44,9 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; UNPACKED:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -80,9 +80,9 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; PACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; PACKED:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -122,9 +122,9 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; UNPACKED:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -161,9 +161,9 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; PACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; PACKED:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -206,9 +206,9 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; UNPACKED:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -246,9 +246,9 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; PACKED:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; PACKED:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
index d0cf50a9062c..2228c2b714f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll
@@ -11,9 +11,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -27,14 +27,14 @@ define amdgpu_ps void @raw_tbuffer_store_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -49,14 +49,14 @@ define amdgpu_ps void @raw_tbuffer_store_v3f32__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
   ; CHECK:   TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY7]], [[REG_SEQUENCE1]], [[COPY8]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -72,14 +72,14 @@ define amdgpu_ps void @raw_tbuffer_store_v4f32__sgpr_rsrc__vgpr_voffset__sgpr_so
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE1]], [[COPY9]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0)
@@ -96,9 +96,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__sgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr7
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -117,9 +117,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; CHECK:   [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
@@ -159,9 +159,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; CHECK:   [[S_MOV_B32_term:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32_term $exec_lo
@@ -204,9 +204,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr5
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; CHECK:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -249,9 +249,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1)
@@ -268,9 +268,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2)
@@ -287,9 +287,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3)
@@ -306,9 +306,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4)
@@ -326,8 +326,8 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vdpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 94, i32 0)
@@ -343,8 +343,8 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 94, i32 0)
@@ -360,8 +360,8 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4)
@@ -379,9 +379,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset = add i32 %voffset.base, 16
@@ -398,9 +398,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   %voffset = add i32 %voffset.base, 4095
@@ -417,9 +417,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
@@ -439,8 +439,8 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -457,8 +457,8 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+  ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -475,9 +475,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
@@ -496,9 +496,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
@@ -517,9 +517,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc
   ; CHECK:   TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
@@ -540,9 +540,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
   ; CHECK:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc
   ; CHECK:   [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -585,9 +585,9 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
index ea73e6255245..c4acb32e9a96 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll
@@ -14,8 +14,8 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 4)
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
   ; GFX6:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
@@ -28,8 +28,8 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 4)
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
   ; GFX7:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
@@ -42,8 +42,8 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 4)
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
   ; GFX8:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
@@ -61,8 +61,8 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1, 0 :: (dereferenceable invariant load 4)
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
   ; GFX6:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
@@ -75,8 +75,8 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1, 0 :: (dereferenceable invariant load 4)
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
   ; GFX7:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
@@ -89,8 +89,8 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1, 0 :: (dereferenceable invariant load 4)
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
   ; GFX8:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
@@ -108,8 +108,8 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 8, align 4)
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1
@@ -127,8 +127,8 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 8, align 4)
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1
@@ -146,8 +146,8 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 8, align 4)
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1
@@ -170,8 +170,8 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4)
   ; GFX6:   [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
   ; GFX6:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11
@@ -197,8 +197,8 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4)
   ; GFX7:   [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
   ; GFX7:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11
@@ -224,8 +224,8 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[S_BUFFER_LOAD_DWORDX4_SGPR:%[0-9]+]]:sgpr_128 = S_BUFFER_LOAD_DWORDX4_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 12, align 4)
   ; GFX8:   [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
   ; GFX8:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_512 = REG_SEQUENCE [[S_BUFFER_LOAD_DWORDX4_SGPR]], %subreg.sub0_sub1_sub2_sub3, [[DEF]], %subreg.sub4_sub5_sub6_sub7, [[DEF]], %subreg.sub8_sub9_sub10_sub11
@@ -256,8 +256,8 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[S_BUFFER_LOAD_DWORDX8_SGPR:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 32, align 4)
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub1
@@ -299,8 +299,8 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[S_BUFFER_LOAD_DWORDX8_SGPR:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 32, align 4)
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub1
@@ -342,8 +342,8 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[S_BUFFER_LOAD_DWORDX8_SGPR:%[0-9]+]]:sgpr_256 = S_BUFFER_LOAD_DWORDX8_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 32, align 4)
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub1
@@ -390,8 +390,8 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX6:   [[S_BUFFER_LOAD_DWORDX16_SGPR:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 64, align 4)
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub0
   ; GFX6:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub1
@@ -465,8 +465,8 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX7:   [[S_BUFFER_LOAD_DWORDX16_SGPR:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 64, align 4)
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub0
   ; GFX7:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub1
@@ -540,8 +540,8 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; GFX8:   [[S_BUFFER_LOAD_DWORDX16_SGPR:%[0-9]+]]:sgpr_512 = S_BUFFER_LOAD_DWORDX16_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 64, align 4)
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub0
   ; GFX8:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub1
@@ -1488,8 +1488,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -1501,8 +1501,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -1514,8 +1514,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -1532,8 +1532,8 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4)
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
@@ -1548,8 +1548,8 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4)
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
@@ -1564,8 +1564,8 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4)
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0
@@ -1585,8 +1585,8 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
@@ -1608,8 +1608,8 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
@@ -1631,8 +1631,8 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
@@ -1659,8 +1659,8 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
@@ -1679,8 +1679,8 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
@@ -1699,8 +1699,8 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
@@ -1724,8 +1724,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -1754,8 +1754,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -1784,8 +1784,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -1819,8 +1819,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -1867,8 +1867,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -1915,8 +1915,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -1968,8 +1968,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -1981,8 +1981,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -1994,8 +1994,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -2013,8 +2013,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -2026,8 +2026,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -2039,8 +2039,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -2058,8 +2058,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -2071,8 +2071,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -2084,8 +2084,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
   ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
@@ -2104,8 +2104,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2134,8 +2134,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2164,8 +2164,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2201,8 +2201,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2231,8 +2231,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2261,8 +2261,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2297,8 +2297,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2345,8 +2345,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2393,8 +2393,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2447,8 +2447,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX6:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2495,8 +2495,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX7:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2543,8 +2543,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
   ; GFX8:   [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4)
@@ -2598,8 +2598,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
   ; GFX6:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX6:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX6:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX6:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -2633,8 +2633,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
   ; GFX7:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX7:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX7:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX7:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -2668,8 +2668,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
   ; GFX8:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX8:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX8:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -2709,8 +2709,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
   ; GFX6:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX6:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX6:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX6:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; GFX6:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -2742,8 +2742,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
   ; GFX7:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX7:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX7:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX7:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; GFX7:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -2775,8 +2775,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
   ; GFX8:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX8:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX8:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; GFX8:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -2815,8 +2815,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
   ; GFX6:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX6:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX6:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; GFX6:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
@@ -2852,8 +2852,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
   ; GFX7:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX7:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX7:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; GFX7:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
@@ -2889,8 +2889,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
   ; GFX8:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX8:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; GFX8:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
@@ -3148,8 +3148,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GFX6:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX6:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX6:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX6:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; GFX6:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -3198,8 +3198,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GFX7:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX7:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX7:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX7:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; GFX7:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -3248,8 +3248,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GFX8:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX8:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX8:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
   ; GFX8:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -3306,8 +3306,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GFX6:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX6:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX6:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068
   ; GFX6:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
@@ -3360,8 +3360,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GFX7:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX7:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX7:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068
   ; GFX7:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
@@ -3414,8 +3414,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GFX8:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX8:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068
   ; GFX8:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
@@ -3474,8 +3474,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GFX6:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX6:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX6:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; GFX6:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX6:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
@@ -3528,8 +3528,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GFX7:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX7:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX7:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; GFX7:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX7:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
@@ -3582,8 +3582,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GFX8:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX8:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
   ; GFX8:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX8:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]]
@@ -3642,8 +3642,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GFX6:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX6:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX6:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
   ; GFX6:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX6:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -3693,8 +3693,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GFX7:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX7:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX7:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5000
   ; GFX7:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX7:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -3744,8 +3744,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GFX8:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX8:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4064
   ; GFX8:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX8:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -3801,8 +3801,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GFX6:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX6:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX6:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076
   ; GFX6:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX6:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -3852,8 +3852,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GFX7:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX7:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX7:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4076
   ; GFX7:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX7:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -3903,8 +3903,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GFX8:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX8:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 12
   ; GFX8:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX8:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -3960,8 +3960,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GFX6:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX6:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX6:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080
   ; GFX6:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX6:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -4011,8 +4011,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GFX7:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX7:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX7:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4080
   ; GFX7:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX7:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -4062,8 +4062,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GFX8:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX8:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX8:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
-  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+  ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; GFX8:   [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; GFX8:   [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3
@@ -4274,9 +4274,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -4287,9 +4287,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -4300,9 +4300,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -4319,9 +4319,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX6:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX6:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -4332,9 +4332,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX7:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX7:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -4345,9 +4345,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
   ; GFX8:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
   ; GFX8:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -4364,9 +4364,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX6:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
@@ -4380,9 +4380,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX7:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
@@ -4396,9 +4396,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX8:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
@@ -4419,9 +4419,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX6:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
@@ -4435,9 +4435,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX7:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
@@ -4451,9 +4451,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX8:   %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
@@ -4475,9 +4475,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX6:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX6:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
@@ -4490,9 +4490,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX7:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX7:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
@@ -4505,9 +4505,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX8:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc
   ; GFX8:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
@@ -4527,9 +4527,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
   ; GFX6:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX6:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX6:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX6:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX6:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX6:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX6:   %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
@@ -4543,9 +4543,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
   ; GFX7:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX7:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX7:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX7:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX7:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX7:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX7:   %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
@@ -4559,9 +4559,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
   ; GFX8:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX8:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX8:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX8:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX8:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX8:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX8:   %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
index 2a361f0315d0..bfffdea59c06 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll
@@ -11,10 +11,10 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
@@ -33,10 +33,10 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32_noret__vgpr_val__sgpr_rsrc_
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]
@@ -52,15 +52,15 @@ define amdgpu_ps <2 x float> @struct_buffer_atomic_add_i64__vgpr_val__sgpr_rsrc_
   ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN]].sub0
@@ -79,15 +79,15 @@ define amdgpu_ps void @struct_buffer_atomic_add_i64_noret__vgpr_val__sgpr_rsrc__
   ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_BOTHEN_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 8 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -106,10 +106,10 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -158,10 +158,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -207,10 +207,10 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__vgpr_val__sgpr_rsrc__vgpr_
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_ADD_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_BOTHEN_RTN]]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
index 0f8186ef8bbd..0da9728ae476 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll
@@ -13,10 +13,10 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sg
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
@@ -39,10 +39,10 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_noret_i32__vgpr_val__vgpr_cm
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
@@ -64,10 +64,10 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
   ; CHECK:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -120,10 +120,10 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
   ; CHECK:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -172,10 +172,10 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sg
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY8]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
index dbea49cb876e..c8ebea7c9e64 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll
@@ -12,10 +12,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX908:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
@@ -27,10 +27,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX90A:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
@@ -47,10 +47,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX908:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
@@ -62,10 +62,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX90A:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
@@ -83,9 +83,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset
@@ -96,9 +96,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0)
@@ -115,9 +115,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
@@ -128,9 +128,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
@@ -148,10 +148,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
   ; GFX908:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX908:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; GFX908:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX908:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -192,10 +192,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
   ; GFX90A:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX90A:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; GFX90A:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX90A:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -242,9 +242,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
   ; GFX908:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX908:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; GFX908:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX908:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -283,9 +283,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
   ; GFX90A:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX90A:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; GFX90A:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; GFX90A:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -329,10 +329,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX908:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
@@ -344,10 +344,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX90A:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
@@ -364,9 +364,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc
@@ -377,9 +377,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2)
@@ -395,10 +395,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX908:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; GFX908:   BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
@@ -410,10 +410,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GFX90A:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; GFX90A:   [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
@@ -430,9 +430,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc
   ; GFX908:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX908:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX908:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX908:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX908:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX908:   BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX908:   S_ENDPGM 0
   ; GFX90A-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset
@@ -443,9 +443,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc
   ; GFX90A:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; GFX90A:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; GFX90A:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; GFX90A:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; GFX90A:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; GFX90A:   [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4)
   ; GFX90A:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
index 88d6711ddcd1..0039097ca142 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
@@ -10,10 +10,10 @@ define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vg
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
@@ -25,10 +25,10 @@ define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vg
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
@@ -45,10 +45,10 @@ define amdgpu_ps <2 x half> @struct_buffer_load_format_v2f16__sgpr_rsrc__vgpr_vi
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0
@@ -71,10 +71,10 @@ define amdgpu_ps <2 x half> @struct_buffer_load_format_v2f16__sgpr_rsrc__vgpr_vi
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN]]
@@ -97,10 +97,10 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__sgpr_rsrc__vgpr_vi
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0
@@ -133,10 +133,10 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__sgpr_rsrc__vgpr_vi
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0
@@ -158,10 +158,10 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin
   ; UNPACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; UNPACKED:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -222,10 +222,10 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin
   ; PACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; PACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; PACKED:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -272,10 +272,10 @@ define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vg
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
@@ -287,10 +287,10 @@ define amdgpu_ps half @struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vg
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
@@ -308,10 +308,10 @@ define amdgpu_ps half @struct_buffer_load_format_i16__sgpr_rsrc__vgpr_vindex__vg
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; UNPACKED:   [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
@@ -323,10 +323,10 @@ define amdgpu_ps half @struct_buffer_load_format_i16__sgpr_rsrc__vgpr_vindex__vg
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
index 1c225364bbae..27fe9ba96de9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
@@ -10,10 +10,10 @@ define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__v
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]]
@@ -30,10 +30,10 @@ define amdgpu_ps <2 x float> @struct_buffer_load_format_v2f32__sgpr_rsrc__vgpr_v
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
@@ -53,10 +53,10 @@ define amdgpu_ps <3 x float> @struct_buffer_load_format_v3f32__sgpr_rsrc__vgpr_v
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0
@@ -78,10 +78,10 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__sgpr_rsrc__vgpr_v
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
@@ -107,10 +107,10 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -161,10 +161,10 @@ define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__v
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]]
@@ -182,10 +182,10 @@ define amdgpu_ps float @struct_buffer_load_format_i32__sgpr_rsrc__vgpr_vindex__v
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
index e736692d6086..6fcb30ac51e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll
@@ -11,10 +11,10 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
@@ -32,10 +32,10 @@ define amdgpu_ps <2 x float> @struct_buffer_load_v2f32__sgpr_rsrc__vgpr_vindex__
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0
@@ -56,10 +56,10 @@ define amdgpu_ps <3 x float> @struct_buffer_load_v3f32__sgpr_rsrc__vgpr_vindex__
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_DWORDX3_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub0
@@ -82,10 +82,10 @@ define amdgpu_ps <4 x float> @struct_buffer_load_v4f32__sgpr_rsrc__vgpr_vindex__
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub0
@@ -110,9 +110,9 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
@@ -132,10 +132,10 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
@@ -153,9 +153,9 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 64
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
@@ -175,10 +175,10 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -222,10 +222,10 @@ define amdgpu_ps float @struct_buffer_load_i8_zext__sgpr_rsrc__vgpr_vindex__vgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_BOTHEN]]
@@ -244,10 +244,10 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4)
   ; CHECK:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_BOTHEN]], 0, 8, implicit $exec
@@ -267,10 +267,10 @@ define amdgpu_ps float @struct_buffer_load_i16_zext__sgpr_rsrc__vgpr_vindex__vgp
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]]
@@ -289,10 +289,10 @@ define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_BOTHEN]], 0, 16, implicit $exec
@@ -313,10 +313,10 @@ define amdgpu_ps half @struct_buffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voff
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]]
@@ -334,10 +334,10 @@ define amdgpu_ps <2 x half> @struct_buffer_load_v2f16__sgpr_rsrc__vgpr_vindex__v
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]
@@ -361,10 +361,10 @@ define amdgpu_ps <4 x half> @struct_buffer_load_v4f16__sgpr_rsrc__vgpr_vindex__v
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0
@@ -385,10 +385,10 @@ define amdgpu_ps float @struct_buffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
index 2276bdde843f..3712975a4995 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll
@@ -11,10 +11,10 @@ define amdgpu_ps void @struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
@@ -26,10 +26,10 @@ define amdgpu_ps void @struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; PACKED:   BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
@@ -46,10 +46,10 @@ define amdgpu_ps void @struct_buffer_store_format_v2f16__vgpr_val__sgpr_rsrc__vg
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec
@@ -65,10 +65,10 @@ define amdgpu_ps void @struct_buffer_store_format_v2f16__vgpr_val__sgpr_rsrc__vg
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; PACKED:   BUFFER_STORE_FORMAT_D16_XY_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
@@ -92,10 +92,10 @@ define amdgpu_ps void @struct_buffer_store_format_v4f16__vgpr_val__sgpr_rsrc__vg
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; UNPACKED:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
   ; UNPACKED:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY]], implicit $exec
@@ -110,15 +110,15 @@ define amdgpu_ps void @struct_buffer_store_format_v4f16__vgpr_val__sgpr_rsrc__vg
   ; PACKED:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
   ; PACKED:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; PACKED:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
   ; PACKED:   BUFFER_STORE_FORMAT_D16_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0
@@ -136,10 +136,10 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; UNPACKED:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; UNPACKED:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -180,10 +180,10 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; PACKED:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; PACKED:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -228,10 +228,10 @@ define amdgpu_ps void @struct_buffer_store_format_i16__vgpr_val__sgpr_rsrc__vgpr
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; UNPACKED:   BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   S_ENDPGM 0
@@ -243,10 +243,10 @@ define amdgpu_ps void @struct_buffer_store_format_i16__vgpr_val__sgpr_rsrc__vgpr
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; PACKED:   BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   S_ENDPGM 0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll
index 03a37e4ed88e..7a540d859a9d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll
@@ -10,10 +10,10 @@ define amdgpu_ps void @struct_buffer_store_format_f32__vgpr_val__sgpr_rsrc__vgpr
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -27,15 +27,15 @@ define amdgpu_ps void @struct_buffer_store_format_v2f32__vgpr_val__sgpr_rsrc__vg
   ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_FORMAT_XY_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -50,15 +50,15 @@ define amdgpu_ps void @struct_buffer_store_format_v3f32__vgpr_val__sgpr_rsrc__vg
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_FORMAT_XYZ_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -74,15 +74,15 @@ define amdgpu_ps void @struct_buffer_store_format_v4f32__vgpr_val__sgpr_rsrc__vg
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -100,10 +100,10 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
@@ -148,10 +148,10 @@ define amdgpu_ps void @struct_buffer_store_format_i32__vgpr_val__sgpr_rsrc__vgpr
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
index de0de1a3aa93..9dede257a02c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
@@ -12,10 +12,10 @@ define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex_
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -29,15 +29,15 @@ define amdgpu_ps void @struct_buffer_store_v2f32_sgpr_rsrc__vgpr_val__vgpr_vinde
   ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -52,15 +52,15 @@ define amdgpu_ps void @struct_buffer_store_v3f32_sgpr_rsrc__vgpr_val__vgpr_vinde
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_DWORDX3_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -76,15 +76,15 @@ define amdgpu_ps void @struct_buffer_store_v4f32_sgpr_rsrc__vgpr_val__vgpr_vinde
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5
   ; CHECK:   [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_DWORDX4_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -101,15 +101,15 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK:   [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
   ; CHECK:   [[COPY11:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
   ; CHECK:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]]
   ; CHECK:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
@@ -154,10 +154,10 @@ define amdgpu_ps void @struct_buffer_store_i8_sgpr_rsrc__vgpr_val__vgpr_vindex__
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_BYTE_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -175,10 +175,10 @@ define amdgpu_ps void @struct_buffer_store_i16_sgpr_rsrc__vgpr_val__vgpr_vindex_
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_SHORT_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -196,10 +196,10 @@ define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex_
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -216,10 +216,10 @@ define amdgpu_ps void @struct_buffer_store_v2f16_sgpr_rsrc__vgpr_val__vgpr_vinde
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
@@ -239,15 +239,15 @@ define amdgpu_ps void @struct_buffer_store_v4f16_sgpr_rsrc__vgpr_val__vgpr_vinde
   ; CHECK:   liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3
   ; CHECK:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3
   ; CHECK:   [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1
   ; CHECK:   BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
index 316f27dd78c9..c0049efda5ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=PACKED %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=UNPACKED %s
 
 define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
   ; PACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
@@ -11,10 +11,10 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
@@ -26,10 +26,10 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]
@@ -46,10 +46,10 @@ define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
@@ -61,10 +61,10 @@ define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0
@@ -98,10 +98,10 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0
@@ -116,10 +116,10 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0
@@ -157,9 +157,9 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
@@ -173,9 +173,9 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
@@ -195,10 +195,10 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__
   ; PACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; PACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; PACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; PACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; PACKED:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -241,10 +241,10 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__
   ; UNPACKED:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; UNPACKED:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; UNPACKED:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; UNPACKED:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; UNPACKED:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -309,10 +309,10 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; PACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; PACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; PACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; PACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; PACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; PACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]]
@@ -324,10 +324,10 @@ define amdgpu_ps half @struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_vof
   ; UNPACKED:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; UNPACKED:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; UNPACKED:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; UNPACKED:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; UNPACKED:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; UNPACKED:   [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; UNPACKED:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
index 9a4b1ac67474..fd7bac4a6721 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll
@@ -10,10 +10,10 @@ define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vo
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]]
@@ -30,10 +30,10 @@ define amdgpu_ps <2 x float> @struct_tbuffer_load_v2f32__sgpr_rsrc__vgpr_vindex_
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0
@@ -53,10 +53,10 @@ define amdgpu_ps <3 x float> @struct_tbuffer_load_v3f32__sgpr_rsrc__vgpr_vindex_
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0
@@ -78,10 +78,10 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__sgpr_rsrc__vgpr_vindex_
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0
@@ -105,9 +105,9 @@ define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vo
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1
@@ -127,10 +127,10 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
   ; CHECK:   [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
@@ -181,10 +181,10 @@ define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_vo
   ; CHECK:   [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
+  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
   ; CHECK:   [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 9ce0d28b7623..901dc28db668 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -485,12 +485,12 @@ define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v3, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i16:
@@ -514,11 +514,10 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_lshr_v2i16_15:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 15, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 15, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 15, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
+; GFX6-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i16_15:
@@ -543,12 +542,12 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ; GFX6-LABEL: s_lshr_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s4
-; GFX6-NEXT:    s_and_b32 s0, s0, s4
-; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s2, s3, s4
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
-; GFX6-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX6-NEXT:    s_and_b32 s0, s0, s4
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshr_b32 s1, s2, s3
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -586,13 +585,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
 ; GFX6-LABEL: lshr_v2i16_sv:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_mov_b32 s2, 0xffff
+; GFX6-NEXT:    v_lshr_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
-; GFX6-NEXT:    s_and_b32 s0, s1, s2
-; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -618,13 +617,13 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: lshr_v2i16_vs:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    s_mov_b32 s2, 0xffff
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, s2
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    ; return to shader part epilog

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index b9e302ad95e5..95dc6e1cbf1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -285,15 +285,8 @@ define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
 define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
-; GFX6-NEXT:    s_xor_b32 s1, s1, -1
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_xor_b32 s0, s3, -1
+; GFX6-NEXT:    s_or_b32 s0, s2, s0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_v2i16:
@@ -309,15 +302,8 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1)
 define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v2i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
+; GFX6-NEXT:    s_xor_b32 s0, s3, -1
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
-; GFX6-NEXT:    s_xor_b32 s1, s1, -1
-; GFX6-NEXT:    s_or_b32 s0, s1, s0
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_v2i16_commute:
@@ -333,15 +319,8 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inre
 define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v2i16_multi_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_and_b32 s1, s4, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
-; GFX6-NEXT:    s_xor_b32 s1, s1, -1
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
+; GFX6-NEXT:    s_xor_b32 s1, s3, -1
+; GFX6-NEXT:    s_or_b32 s0, s2, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_v2i16_multi_use:
@@ -362,19 +341,9 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
 define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
 ; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b32 s1, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
-; GFX6-NEXT:    s_and_b32 s2, s2, s1
-; GFX6-NEXT:    s_or_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s3, s4, s1
-; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
-; GFX6-NEXT:    s_or_b32 s2, s2, s3
-; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
-; GFX6-NEXT:    s_and_b32 s1, s6, s1
+; GFX6-NEXT:    s_xor_b32 s1, s4, -1
+; GFX6-NEXT:    s_or_b32 s0, s2, s1
 ; GFX6-NEXT:    s_or_b32 s1, s3, s1
-; GFX6-NEXT:    s_xor_b32 s1, s1, -1
-; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_or_b32 s1, s2, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use:
@@ -397,13 +366,6 @@ define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 ; GFX6-LABEL: v_orn2_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffff
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
-; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
index 42d6b03772c4..597d570ed48c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll
@@ -15,8 +15,8 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
   ; FAST:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; FAST:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; FAST:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
-  ; FAST:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; FAST:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; FAST:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
   ; FAST:   [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
@@ -33,8 +33,8 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; GREEDY:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; GREEDY:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
-  ; GREEDY:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GREEDY:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
   ; GREEDY:   [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1)
@@ -58,8 +58,8 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
   ; FAST:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; FAST:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; FAST:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
-  ; FAST:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10
   ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; FAST:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10
   ; FAST:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32)
   ; FAST:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
@@ -77,8 +77,8 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; GREEDY:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; GREEDY:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
-  ; GREEDY:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GREEDY:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32)
   ; GREEDY:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
@@ -104,8 +104,8 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) {
   ; FAST:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; FAST:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
   ; FAST:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7
-  ; FAST:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8
   ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; FAST:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8
   ; FAST:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
   ; FAST:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -158,8 +158,8 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) {
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; GREEDY:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
   ; GREEDY:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7
-  ; GREEDY:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GREEDY:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
   ; GREEDY:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -219,8 +219,8 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg
   ; FAST:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; FAST:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
   ; FAST:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7
-  ; FAST:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; FAST:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; FAST:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32)
   ; FAST:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
@@ -274,8 +274,8 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; GREEDY:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
   ; GREEDY:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7
-  ; GREEDY:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
+  ; GREEDY:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32)
   ; GREEDY:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
index c62653631e51..9ff032726c8e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
@@ -15,13 +15,13 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre
   ; FAST:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; FAST:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; FAST:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
+  ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; FAST:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10
   ; FAST:   [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr11
   ; FAST:   [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr12
   ; FAST:   [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr13
-  ; FAST:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
-  ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; FAST:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; FAST:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; FAST:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
   ; FAST:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1)
@@ -37,13 +37,13 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; GREEDY:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; GREEDY:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GREEDY:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10
   ; GREEDY:   [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr11
   ; GREEDY:   [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr12
   ; GREEDY:   [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr13
-  ; GREEDY:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GREEDY:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GREEDY:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
   ; GREEDY:   G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1)
@@ -66,13 +66,13 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre
   ; FAST:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; FAST:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; FAST:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
+  ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; FAST:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10
   ; FAST:   [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr11
   ; FAST:   [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr12
   ; FAST:   [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr13
-  ; FAST:   [[COPY12:%[0-9]+]]:sgpr(s32) = COPY $sgpr14
-  ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; FAST:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; FAST:   [[COPY12:%[0-9]+]]:sgpr(s32) = COPY $sgpr14
   ; FAST:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST:   [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32)
   ; FAST:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
@@ -89,13 +89,13 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; GREEDY:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; GREEDY:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GREEDY:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr10
   ; GREEDY:   [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr11
   ; GREEDY:   [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr12
   ; GREEDY:   [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr13
-  ; GREEDY:   [[COPY12:%[0-9]+]]:sgpr(s32) = COPY $sgpr14
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GREEDY:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GREEDY:   [[COPY12:%[0-9]+]]:sgpr(s32) = COPY $sgpr14
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32)
   ; GREEDY:   [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load 16 from custom "ImageResource")
@@ -120,13 +120,13 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr
   ; FAST:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; FAST:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
   ; FAST:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7
+  ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; FAST:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; FAST:   [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; FAST:   [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; FAST:   [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; FAST:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr8
-  ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; FAST:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; FAST:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr8
   ; FAST:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
   ; FAST:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -178,13 +178,13 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; GREEDY:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
   ; GREEDY:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GREEDY:   [[COPY8:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[COPY9:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY10:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY11:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr8
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GREEDY:   [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GREEDY:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr8
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
   ; GREEDY:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -243,13 +243,13 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre
   ; FAST:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; FAST:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; FAST:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
+  ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; FAST:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; FAST:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; FAST:   [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; FAST:   [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; FAST:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
-  ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; FAST:   [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; FAST:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; FAST:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
   ; FAST:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -291,13 +291,13 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; GREEDY:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; GREEDY:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GREEDY:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; GREEDY:   [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; GREEDY:   [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; GREEDY:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GREEDY:   [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GREEDY:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
   ; GREEDY:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -346,13 +346,13 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr
   ; FAST:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; FAST:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
   ; FAST:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7
+  ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; FAST:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8
   ; FAST:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr9
   ; FAST:   [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr10
   ; FAST:   [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr11
-  ; FAST:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr12
-  ; FAST:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; FAST:   [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; FAST:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr12
   ; FAST:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; FAST:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
   ; FAST:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
@@ -416,13 +416,13 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; GREEDY:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
   ; GREEDY:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY $vgpr7
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GREEDY:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr8
   ; GREEDY:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY $vgpr9
   ; GREEDY:   [[COPY10:%[0-9]+]]:vgpr(s32) = COPY $vgpr10
   ; GREEDY:   [[COPY11:%[0-9]+]]:vgpr(s32) = COPY $vgpr11
-  ; GREEDY:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr12
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32)
   ; GREEDY:   [[BUILD_VECTOR1:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32)
+  ; GREEDY:   [[COPY12:%[0-9]+]]:vgpr(s32) = COPY $vgpr12
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[DEF1:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
   ; GREEDY:   [[DEF2:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
index 4aa0ead205bf..2fb84ebd73c8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
@@ -11,9 +11,9 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY4]], [[COPY5]], 0, 0, 0 :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
@@ -32,9 +32,9 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
@@ -55,9 +55,9 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
@@ -102,9 +102,9 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
@@ -140,9 +140,9 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index ca880839ee93..b03b6f144dbd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -11,8 +11,8 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4)
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
   ; CHECK:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
@@ -25,8 +25,8 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; GREEDY:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4)
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
   ; GREEDY:   [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
@@ -44,8 +44,8 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4)
   ; CHECK:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
@@ -62,8 +62,8 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; GREEDY:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4)
   ; GREEDY:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
@@ -85,8 +85,8 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 12, align 4)
   ; CHECK:   [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF
   ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>)
@@ -111,8 +111,8 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; GREEDY:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 12, align 4)
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(<4 x s32>) = G_IMPLICIT_DEF
   ; GREEDY:   [[CONCAT_VECTORS:%[0-9]+]]:sgpr(<12 x s32>) = G_CONCAT_VECTORS [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), [[DEF]](<4 x s32>), [[DEF]](<4 x s32>)
@@ -142,8 +142,8 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4)
   ; CHECK:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
@@ -178,8 +178,8 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; GREEDY:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4)
   ; GREEDY:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
@@ -219,8 +219,8 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4)
   ; CHECK:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
@@ -279,8 +279,8 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; GREEDY:   [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4)
   ; GREEDY:   [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
@@ -345,8 +345,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4)
@@ -359,8 +359,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 4)
@@ -378,8 +378,8 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 8, align 4)
@@ -394,8 +394,8 @@ define amdgpu_ps <2 x float> @s_buffer_load_v2f32_vgpr_offset(<4 x i32> inreg %r
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 8, align 4)
@@ -415,8 +415,8 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
@@ -439,8 +439,8 @@ define amdgpu_ps <3 x float> @s_buffer_load_v3f32_vgpr_offset(<4 x i32> inreg %r
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
@@ -468,8 +468,8 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
@@ -486,8 +486,8 @@ define amdgpu_ps <4 x float> @s_buffer_load_v4f32_vgpr_offset(<4 x i32> inreg %r
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
@@ -509,8 +509,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
@@ -533,8 +533,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset(<4 x i32> inreg %r
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
@@ -562,8 +562,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
@@ -596,8 +596,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset(<4 x i32> inreg
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[C]], 0, 0, 0 :: (dereferenceable invariant load 16, align 4)
@@ -635,8 +635,8 @@ define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -651,8 +651,8 @@ define amdgpu_ps void @s_buffer_load_i96_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -674,8 +674,8 @@ define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -695,8 +695,8 @@ define amdgpu_ps void @s_buffer_load_i256_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -723,8 +723,8 @@ define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -752,8 +752,8 @@ define amdgpu_ps void @s_buffer_load_i512_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -788,8 +788,8 @@ define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -809,8 +809,8 @@ define amdgpu_ps void @s_buffer_load_v16i16_vgpr_offset(<4 x i32> inreg %rsrc, i
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -837,8 +837,8 @@ define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -866,8 +866,8 @@ define amdgpu_ps void @s_buffer_load_v32i16_vgpr_offset(<4 x i32> inreg %rsrc, i
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -902,8 +902,8 @@ define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i3
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -923,8 +923,8 @@ define amdgpu_ps void @s_buffer_load_v4i64_vgpr_offset(<4 x i32> inreg %rsrc, i3
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -951,8 +951,8 @@ define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i3
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -980,8 +980,8 @@ define amdgpu_ps void @s_buffer_load_v8i64_vgpr_offset(<4 x i32> inreg %rsrc, i3
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -1016,8 +1016,8 @@ define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -1037,8 +1037,8 @@ define amdgpu_ps void @s_buffer_load_v4p1_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -1065,8 +1065,8 @@ define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -1094,8 +1094,8 @@ define amdgpu_ps void @s_buffer_load_v8p1_vgpr_offset(<4 x i32> inreg %rsrc, i32
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -1129,8 +1129,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1146,8 +1146,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4092(<4 x i32> inreg %
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1169,8 +1169,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1186,8 +1186,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4095(<4 x i32> inreg %
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1209,8 +1209,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1225,8 +1225,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_offset_add_4096(<4 x i32> inreg %
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1248,8 +1248,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1275,8 +1275,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4064(<4 x i32>
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1309,8 +1309,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1335,8 +1335,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_add_4068(<4 x i32>
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1367,8 +1367,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1404,8 +1404,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4032(<4 x i3
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4032
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1447,8 +1447,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1483,8 +1483,8 @@ define amdgpu_ps <16 x float> @s_buffer_load_v16f32_vgpr_offset_add_4036(<4 x i3
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4036
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -1526,8 +1526,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -1565,8 +1565,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
   ; GREEDY:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; GREEDY:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; GREEDY:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -1610,8 +1610,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
   ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -1650,8 +1650,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
   ; GREEDY:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; GREEDY:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; GREEDY:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4092
   ; GREEDY:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -1697,8 +1697,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
   ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
@@ -1738,8 +1738,8 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
   ; GREEDY:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; GREEDY:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; GREEDY:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
   ; GREEDY:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
@@ -1955,8 +1955,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
   ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
   ; CHECK:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -2003,8 +2003,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
   ; GREEDY:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; GREEDY:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; GREEDY:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4064
   ; GREEDY:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
   ; GREEDY:   [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -2059,8 +2059,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
   ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
@@ -2108,8 +2108,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
   ; GREEDY:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; GREEDY:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; GREEDY:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4068
   ; GREEDY:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
@@ -2163,8 +2163,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
   ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
@@ -2212,8 +2212,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
   ; GREEDY:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; GREEDY:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; GREEDY:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4096
   ; GREEDY:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY4]], [[C]]
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
@@ -2267,8 +2267,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -2315,8 +2315,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
   ; GREEDY:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; GREEDY:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; GREEDY:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5000
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -2369,8 +2369,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -2417,8 +2417,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
   ; GREEDY:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; GREEDY:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; GREEDY:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4076
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -2471,8 +2471,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -2519,8 +2519,8 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
   ; GREEDY:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; GREEDY:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; GREEDY:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
-  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
+  ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4080
   ; GREEDY:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY5]]
@@ -2672,9 +2672,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
   ; CHECK:   [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -2688,9 +2688,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr(<4 x i32> inreg %
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
   ; GREEDY:   [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -2710,9 +2710,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
   ; CHECK:   [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -2726,9 +2726,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr(<4 x i32> inreg %
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
   ; GREEDY:   [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
@@ -2748,9 +2748,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
@@ -2768,9 +2768,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
@@ -2795,9 +2795,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
@@ -2815,9 +2815,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]]
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
@@ -2843,9 +2843,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
   ; CHECK:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]]
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
@@ -2861,9 +2861,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
   ; GREEDY:   [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]]
   ; GREEDY:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32)
@@ -2886,9 +2886,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]
@@ -2905,9 +2905,9 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
   ; GREEDY:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; GREEDY:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; GREEDY:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; GREEDY:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; GREEDY:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; GREEDY:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024
   ; GREEDY:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; GREEDY:   [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
index c7cdead83cad..109143ccdfff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
@@ -11,10 +11,10 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vg
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]], [[COPY6]], 0, 0, -1 :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32)
   ; CHECK:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -31,10 +31,10 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__sgpr_val__sgpr_vindex__sg
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; CHECK:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
   ; CHECK:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]], [[COPY6]], 0, 0, -1 :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4)
@@ -54,10 +54,10 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
   ; CHECK:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
   ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
@@ -100,10 +100,10 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
   ; CHECK:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -137,10 +137,10 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr6
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF
   ; CHECK:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
   ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll
index a43959b25080..2d3d27c04bfc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll
@@ -11,11 +11,11 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   G_AMDGPU_BUFFER_STORE [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]], [[COPY7]], 0, 0, -1 :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4)
   ; CHECK:   S_ENDPGM 0
   call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -31,11 +31,11 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__sgpr_val__sgpr_vindex__sg
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK:   [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7
   ; CHECK:   [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr8
   ; CHECK:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr9
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32)
   ; CHECK:   [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32)
   ; CHECK:   [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[COPY6]](s32)
@@ -55,11 +55,11 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
   ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
@@ -99,11 +99,11 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg
   ; CHECK:   [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
   ; CHECK:   [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
   ; CHECK:   [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr3
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
   ; CHECK: bb.2:
@@ -134,11 +134,11 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg
   ; CHECK:   [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
   ; CHECK:   [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
   ; CHECK:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY $vgpr3
+  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr4
   ; CHECK:   [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr5
   ; CHECK:   [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr6
   ; CHECK:   [[COPY7:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr7
-  ; CHECK:   [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK:   [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
   ; CHECK:   [[UV:%[0-9]+]]:vreg_64(s64), [[UV1:%[0-9]+]]:vreg_64(s64) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<4 x s32>)
   ; CHECK:   [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
index 09eda4be6197..80aa9e6f9239 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -176,22 +176,24 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; GFX6-LABEL: v_roundeven_v2f16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v2f16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
-; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v1, v2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -221,9 +223,6 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX6-LABEL: v_roundeven_v2f16_fneg:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
@@ -237,9 +236,6 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX7-LABEL: v_roundeven_v2f16_fneg:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index a4e7e30b42b8..c8723171fd0a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -2702,19 +2702,21 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_saddsat_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
-; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
+; GFX6-NEXT:    v_max_i32_e32 v1, v5, v1
+; GFX6-NEXT:    v_min_i32_e32 v1, v1, v4
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
@@ -2768,18 +2770,20 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-LABEL: s_saddsat_v2i16:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    s_min_i32 s7, s0, 0
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s7, s5, s7
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    s_max_i32 s6, s0, 0
 ; GFX6-NEXT:    s_sub_i32 s6, s4, s6
-; GFX6-NEXT:    s_max_i32 s2, s7, s2
-; GFX6-NEXT:    s_min_i32 s2, s2, s6
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_add_i32 s0, s0, s2
+; GFX6-NEXT:    s_max_i32 s1, s7, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s6
+; GFX6-NEXT:    s_add_i32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_max_i32 s3, s1, 0
 ; GFX6-NEXT:    s_sub_i32 s3, s4, s3
@@ -2854,9 +2858,11 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: saddsat_v2i16_sv:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_brev_b32 s3, 1
 ; GFX6-NEXT:    s_min_i32 s5, s0, 0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_sub_i32 s5, s3, s5
 ; GFX6-NEXT:    s_brev_b32 s2, -2
@@ -2927,9 +2933,11 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-LABEL: saddsat_v2i16_vs:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_brev_b32 s3, 1
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v3
 ; GFX6-NEXT:    s_brev_b32 s2, -2

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index dd9343579967..b3d374dfe5e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -454,13 +454,9 @@ define i32 @v_shl_i32_zext_i16(i16 %x) {
 define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
 ; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b32 s2, 0xffff
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX7-NEXT:    s_and_b32 s0, s0, s2
-; GFX7-NEXT:    s_or_b32 s0, s1, s0
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0x3fff3fff
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX7-NEXT:    s_and_b32 s0, s0, s2
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -502,13 +498,9 @@ define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
 ; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index ff16d8a6fffa..bef27f930896 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -485,11 +485,11 @@ define <2 x i16> @v_shl_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-LABEL: v_shl_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v3, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v2i16:
@@ -513,6 +513,7 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_shl_v2i16_15:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 15, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 15, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -539,10 +540,11 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun
 ; GFX6-LABEL: s_shl_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_mov_b32 s4, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s2, s4
-; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s2, s3, s4
-; GFX6-NEXT:    s_lshl_b32 s1, s1, s2
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    s_and_b32 s1, s1, s4
+; GFX6-NEXT:    s_lshl_b32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s2, s3
 ; GFX6-NEXT:    s_and_b32 s1, s1, s4
 ; GFX6-NEXT:    s_and_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
@@ -579,8 +581,9 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun
 define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
 ; GFX6-LABEL: shl_v2i16_sv:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    s_mov_b32 s2, 0xffff
-; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_lshl_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
@@ -611,11 +614,12 @@ define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: shl_v2i16_vs:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    s_mov_b32 s2, 0xffff
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    s_and_b32 s0, s0, s2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
-; GFX6-NEXT:    s_and_b32 s0, s1, s2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 945bac091858..35de998cdbd7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -2688,18 +2688,20 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
-; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_max_i32_e32 v1, v4, v1
+; GFX6-NEXT:    v_min_i32_e32 v1, v1, v5
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
 ; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
@@ -2754,18 +2756,20 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-LABEL: s_ssubsat_v2i16:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    s_max_i32 s6, s0, -1
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s4
 ; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    s_min_i32 s7, s0, -1
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s5
-; GFX6-NEXT:    s_max_i32 s2, s6, s2
-; GFX6-NEXT:    s_min_i32 s2, s2, s7
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_sub_i32 s0, s0, s2
+; GFX6-NEXT:    s_max_i32 s1, s6, s1
+; GFX6-NEXT:    s_min_i32 s1, s1, s7
+; GFX6-NEXT:    s_sub_i32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_max_i32 s3, s1, -1
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
@@ -2840,9 +2844,11 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: ssubsat_v2i16_sv:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    s_brev_b32 s2, -2
 ; GFX6-NEXT:    s_max_i32 s4, s0, -1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_sub_i32 s4, s4, s2
 ; GFX6-NEXT:    s_brev_b32 s3, 1
@@ -2913,9 +2919,11 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-LABEL: ssubsat_v2i16_vs:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_brev_b32 s2, -2
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v2
 ; GFX6-NEXT:    s_brev_b32 s3, 1

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 06232cec3309..6c1acb69cb2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -1748,12 +1748,14 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v0
-; GFX6-NEXT:    v_min_u32_e32 v2, v4, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v0
+; GFX6-NEXT:    v_min_u32_e32 v1, v4, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v1
 ; GFX6-NEXT:    v_min_u32_e32 v2, v3, v2
@@ -1791,12 +1793,14 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-LABEL: s_uaddsat_v2i16:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_not_b32 s4, s0
-; GFX6-NEXT:    s_min_u32 s2, s4, s2
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_add_i32 s0, s0, s2
+; GFX6-NEXT:    s_not_b32 s4, s0
+; GFX6-NEXT:    s_min_u32 s1, s4, s1
+; GFX6-NEXT:    s_add_i32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_not_b32 s3, s1
 ; GFX6-NEXT:    s_min_u32 s2, s3, s2
@@ -1841,7 +1845,9 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: uaddsat_v2i16_sv:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_not_b32 s2, s0
 ; GFX6-NEXT:    v_min_u32_e32 v0, s2, v0
@@ -1885,7 +1891,9 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-LABEL: uaddsat_v2i16_vs:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_min_u32_e32 v2, s0, v2

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index cf3427dac88f..a41c88d53bee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -1662,11 +1662,13 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v2i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_min_u32_e32 v2, v0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_min_u32_e32 v2, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
@@ -1703,11 +1705,13 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-LABEL: s_usubsat_v2i16:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX6-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_min_u32 s2, s0, s2
-; GFX6-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_min_u32 s1, s0, s1
+; GFX6-NEXT:    s_sub_i32 s0, s0, s1
+; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
 ; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_min_u32 s2, s1, s2
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
@@ -1751,6 +1755,8 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: usubsat_v2i16_sv:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_u32_e32 v0, s0, v0
@@ -1793,6 +1799,8 @@ define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-LABEL: usubsat_v2i16_vs:
 ; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX6-NEXT:    v_min_u32_e32 v2, s0, v0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index 0ba151d33bff..17d199e31963 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -19,13 +19,6 @@ entry:
 define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) {
 ; GFX7-LABEL: scalar_xnor_v2i16_one_use:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 s4, 0xffff
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX7-NEXT:    s_and_b32 s0, s0, s4
-; GFX7-NEXT:    s_or_b32 s0, s1, s0
-; GFX7-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX7-NEXT:    s_and_b32 s2, s2, s4
-; GFX7-NEXT:    s_or_b32 s1, s1, s2
 ; GFX7-NEXT:    s_xor_b32 s0, s0, s1
 ; GFX7-NEXT:    s_xor_b32 s0, s0, -1
 ; GFX7-NEXT:    ; return to shader part epilog
