[llvm] fdaad48 - AMDGPU/GlobalISel: Initial selection of MUBUF addr64 load/store

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 27 07:14:06 PST 2020


Author: Matt Arsenault
Date: 2020-01-27T07:13:56-08:00
New Revision: fdaad485e620de39ea578e02535c6e75e44581ff

URL: https://github.com/llvm/llvm-project/commit/fdaad485e620de39ea578e02535c6e75e44581ff
DIFF: https://github.com/llvm/llvm-project/commit/fdaad485e620de39ea578e02535c6e75e44581ff.diff

LOG: AMDGPU/GlobalISel: Initial selection of MUBUF addr64 load/store

Fixes the main reason for compile failures on SI, but doesn't really
try to use the addressing modes yet.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUGISel.td
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-inttoptr.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.append.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.consume.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 44b27dde14cc..5cbbc283821f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -84,6 +84,10 @@ def gi_ds_1addr_1offset :
     GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">,
     GIComplexPatternEquiv<DS1Addr1Offset>;
 
+def gi_mubuf_addr64 :
+    GIComplexOperandMatcher<s64, "selectMUBUFAddr64">,
+    GIComplexPatternEquiv<MUBUFAddr64>;
+
 
 // Separate load nodes are defined to glue m0 initialization in
 // SelectionDAG. The GISel selector can just insert m0 initialization

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2b6308dc1549..eb4c4e0eba01 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1343,6 +1343,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                      SDValue &TFE, SDValue &DLC,
                                      SDValue &SWZ) const {
   // Subtarget prefers to use flat instruction
+  // FIXME: This should be a pattern predicate and not reach here
   if (Subtarget->useFlatForGlobal())
     return false;
 
@@ -1438,6 +1439,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   SDValue Ptr, Offen, Idxen, Addr64;
 
   // addr64 bit was removed for volcanic islands.
+  // FIXME: This should be a pattern predicate and not reach here
   if (!Subtarget->hasAddr64())
     return false;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ce47e56da02a..c580e72aefbb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2552,6 +2552,84 @@ AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
     }};
 }
 
+static void addZeroImm(MachineInstrBuilder &MIB) {
+  MIB.addImm(0);
+}
+
+/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
+/// BasePtr is not valid, a null base pointer will be ussed.
+static Register buildRSrc(MachineInstr *MI, MachineRegisterInfo &MRI,
+                          const SIInstrInfo &TII, Register BasePtr) {
+  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
+
+  const DebugLoc &DL = MI->getDebugLoc();
+  MachineBasicBlock *BB = MI->getParent();
+
+  // TODO: Try to use a real pointer if available.
+  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_MOV_B32), RSrc2)
+    .addImm(0);
+  BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_MOV_B32), RSrc3)
+    .addImm(TII.getDefaultRsrcDataFormat() >> 32);
+
+  // Build the half of the subregister with the constants before building the
+  // full 128-bit register. If we are building multiple resource descriptors,
+  // this will allow CSEing of the 2-component register.
+  BuildMI(*BB, MI, DL, TII.get(AMDGPU::REG_SEQUENCE), RSrcHi)
+    .addReg(RSrc2)
+    .addImm(AMDGPU::sub0)
+    .addReg(RSrc3)
+    .addImm(AMDGPU::sub1);
+
+  Register RSrcLo = BasePtr;
+  if (!BasePtr) {
+    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_MOV_B64), RSrcLo)
+      .addImm(0);
+  }
+
+  BuildMI(*BB, MI, DL, TII.get(AMDGPU::REG_SEQUENCE), RSrc)
+    .addReg(RSrcLo)
+    .addImm(AMDGPU::sub0_sub1)
+    .addReg(RSrcHi)
+    .addImm(AMDGPU::sub2_sub3);
+
+  return RSrc;
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
+  // FIXME: Predicates should stop this from reaching here.
+  // addr64 bit was removed for volcanic islands.
+  if (!STI.hasAddr64() || STI.useFlatForGlobal())
+    return {};
+
+  MachineInstr *MI = MRI->getVRegDef(Root.getReg());
+  Register VAddr = Root.getReg();
+  int64_t Offset = 0;
+
+  // TODO: Attempt to use addressing modes. We need to look back through regbank
+  // copies to find a 64-bit SGPR base and VGPR offset.
+
+  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
+  // pattern.
+  return {{
+      [=](MachineInstrBuilder &MIB) {  // rsrc
+        MIB.addReg(buildRSrc(MI, *MRI, TII, Register()));
+      },
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(VAddr); },  // vaddr
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // soffset
+      addZeroImm, //  offset
+      addZeroImm, //  glc
+      addZeroImm, //  slc
+      addZeroImm, //  tfe
+      addZeroImm, //  dlc
+      addZeroImm  //  swz
+    }};
+}
+
 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index d7dc8de3677d..94019ddf8ff5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -180,6 +180,9 @@ class AMDGPUInstructionSelector : public InstructionSelector {
   InstructionSelector::ComplexRendererFns
   selectDS1Addr1Offset(MachineOperand &Root) const;
 
+  InstructionSelector::ComplexRendererFns
+  selectMUBUFAddr64(MachineOperand &Root) const;
+
   void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                         int OpIdx = -1) const;
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
index bf4a3a254c12..eebfbee8a12e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; End to end tests for scalar vs. vector boolean legalization strategies.
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
index 1afa71ec4a30..e600aa0e1805 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64  -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64  -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=WAVE32 %s
 
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
index b69776bb7b39..636b1d2dda69 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX7 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX7 %s
 
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
index 0b82dd159d97..020e171d3fd5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX7 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX7 %s
 
 # FIXME: Ideally this would fail to select with ieee mode enabled.
 ---

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
index b5d9c8851bff..d6ac32e41543 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX7 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX7 %s
 
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
index 2f7319c57ac4..1f4decb7826a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX7 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s  | FileCheck -check-prefix=GFX7 %s
 
 # FIXME: Ideally this would fail to select with ieee mode enabled.
 ---

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
index eb4a5484d1ad..e6736f2d7147 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
 
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir
index 17da10515a36..b4ac02a72f61 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0  -o - %s | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0  -o - %s | FileCheck -check-prefixes=GCN %s
 # XUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=2 -pass-remarks-missed='gisel*' -o /dev/null  %s 2>&1 | FileCheck -check-prefixes=ERR %s
 
 ---

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-inttoptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-inttoptr.mir
index 1920b6b9f3bb..6321080d6bec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-inttoptr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-inttoptr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
 
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
index 2e1a2ea089eb..ed7007fe5818 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir
@@ -1,6 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s
 # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s
+# RUN: llc -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7-FLAT %s
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 
 ---
@@ -16,14 +17,29 @@ body: |
 
     ; GFX6-LABEL: name: load_atomic_global_s32_seq_cst
     ; GFX6: liveins: $vgpr0_vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6: [[LOAD:%[0-9]+]]:vgpr_32(s32) = G_LOAD [[COPY]](p1) :: (load seq_cst 4, addrspace 1)
-    ; GFX6: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -56,6 +72,11 @@ body: |
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; GFX7-FLAT-LABEL: name: load_atomic_global_v2s16_seq_cst
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ; GFX9-LABEL: name: load_atomic_global_v2s16_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -88,6 +109,11 @@ body: |
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[LOAD]](p3)
+    ; GFX7-FLAT-LABEL: name: load_atomic_global_p3_seq_cst
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[LOAD]](p3)
     ; GFX9-LABEL: name: load_atomic_global_p3_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -112,14 +138,29 @@ body: |
 
     ; GFX6-LABEL: name: load_atomic_global_s64_seq_cst
     ; GFX6: liveins: $vgpr0_vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1)
-    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
     ; GFX7-LABEL: name: load_atomic_global_s64_seq_cst
     ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8, addrspace 1)
-    ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1)
+    ; GFX7: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_atomic_global_s64_seq_cst
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX9-LABEL: name: load_atomic_global_s64_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -152,6 +193,11 @@ body: |
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; GFX7-FLAT-LABEL: name: load_atomic_global_v2s32_seq_cst
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     ; GFX9-LABEL: name: load_atomic_global_v2s32_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -184,6 +230,11 @@ body: |
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; GFX7-FLAT-LABEL: name: load_atomic_global_v4s16_seq_cst
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ; GFX9-LABEL: name: load_atomic_global_v4s16_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -216,6 +267,11 @@ body: |
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; GFX7-FLAT-LABEL: name: load_atomic_global_p1_seq_cst
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ; GFX9-LABEL: name: load_atomic_global_p1_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -248,6 +304,11 @@ body: |
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
+    ; GFX7-FLAT-LABEL: name: load_atomic_global_p0_seq_cst
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_64(p0) = G_LOAD [[COPY]](p1) :: (load seq_cst 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[LOAD]](p0)
     ; GFX9-LABEL: name: load_atomic_global_p0_seq_cst
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -272,26 +333,59 @@ body: |
 
     ; GFX6-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
     ; GFX6: liveins: $vgpr0_vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 -2048
-    ; GFX6: [[GEP:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX6: [[LOAD:%[0-9]+]]:vgpr_32(s32) = G_LOAD [[GEP]](p1) :: (load seq_cst 4, addrspace 1)
-    ; GFX6: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -328,26 +422,59 @@ body: |
 
     ; GFX6-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
     ; GFX6: liveins: $vgpr0_vgpr1
-    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 4095
-    ; GFX6: [[GEP:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX6: [[LOAD:%[0-9]+]]:vgpr_32(s32) = G_LOAD [[GEP]](p1) :: (load seq_cst 4, addrspace 1)
-    ; GFX6: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
index 2c070d65565d..3bbc8f0e016d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir
@@ -1,5 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s
 # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s
+# RUN: llc -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7-FLAT %s
 # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
@@ -18,11 +20,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_4
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_4
     ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_4
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX8-LABEL: name: load_global_s32_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -56,11 +78,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_2
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_2
     ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
+    ; GFX7: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_2
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]]
     ; GFX8-LABEL: name: load_global_s32_from_2
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -94,11 +136,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1
     ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -132,11 +194,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_v2s32
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
     ; GFX7-LABEL: name: load_global_v2s32
     ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
-    ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1)
+    ; GFX7: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_v2s32
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_global_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -170,11 +252,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_v3s32
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[BUFFER_LOAD_DWORDX3_ADDR64_:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[BUFFER_LOAD_DWORDX3_ADDR64_]]
     ; GFX7-LABEL: name: load_global_v3s32
     ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4, addrspace 1)
-    ; GFX7: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
+    ; GFX7: [[BUFFER_LOAD_DWORDX3_ADDR64_:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1)
+    ; GFX7: $vgpr0_vgpr1_vgpr2 = COPY [[BUFFER_LOAD_DWORDX3_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_v3s32
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]]
     ; GFX8-LABEL: name: load_global_v3s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -208,11 +310,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_v4s32
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
     ; GFX7-LABEL: name: load_global_v4s32
     ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
-    ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ; GFX7: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1)
+    ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_v4s32
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX8-LABEL: name: load_global_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -246,11 +368,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s64
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ; GFX7-LABEL: name: load_global_s64
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s64
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -284,11 +416,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_v2s64
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_128(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ; GFX7-LABEL: name: load_global_v2s64
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
     ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
+    ; GFX7-FLAT-LABEL: name: load_global_v2s64
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]]
     ; GFX8-LABEL: name: load_global_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -322,11 +464,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_v2p1
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
     ; GFX7-LABEL: name: load_global_v2p1
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1)
     ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
+    ; GFX7-FLAT-LABEL: name: load_global_v2p1
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_128(<2 x p1>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
     ; GFX8-LABEL: name: load_global_v2p1
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -359,11 +511,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s96
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load 12, align 4, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
     ; GFX7-LABEL: name: load_global_s96
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load 12, align 4, addrspace 1)
     ; GFX7: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ; GFX7-FLAT-LABEL: name: load_global_s96
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_96(s96) = G_LOAD [[COPY]](p1) :: (load 12, align 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
     ; GFX8-LABEL: name: load_global_s96
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -396,11 +558,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s128
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
     ; GFX7-LABEL: name: load_global_s128
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1)
     ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
+    ; GFX7-FLAT-LABEL: name: load_global_s128
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
     ; GFX8-LABEL: name: load_global_s128
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -433,11 +605,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_p3_from_4
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vgpr_32(p3) = G_LOAD [[COPY]](p1) :: (load 4, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[LOAD]](p3)
     ; GFX7-LABEL: name: load_global_p3_from_4
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; GFX7-FLAT-LABEL: name: load_global_p3_from_4
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX8-LABEL: name: load_global_p3_from_4
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -471,11 +653,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_p1_from_8
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ; GFX7-LABEL: name: load_global_p1_from_8
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7-FLAT-LABEL: name: load_global_p1_from_8
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_global_p1_from_8
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -509,11 +701,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_p999_from_8
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
     ; GFX7-LABEL: name: load_global_p999_from_8
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
+    ; GFX7-FLAT-LABEL: name: load_global_p999_from_8
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
     ; GFX8-LABEL: name: load_global_p999_from_8
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -546,11 +748,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_v2p3
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
     ; GFX7-LABEL: name: load_global_v2p3
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ; GFX7-FLAT-LABEL: name: load_global_v2p3
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
     ; GFX8-LABEL: name: load_global_v2p3
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -583,11 +795,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_v2s16
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vgpr_32(<2 x s16>) = G_LOAD [[COPY]](p1) :: (load 4, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ; GFX7-LABEL: name: load_global_v2s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
     ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
+    ; GFX7-FLAT-LABEL: name: load_global_v2s16
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]]
     ; GFX8-LABEL: name: load_global_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -621,11 +843,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_v4s16
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<4 x s16>) = G_LOAD [[COPY]](p1) :: (load 8, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ; GFX7-LABEL: name: load_global_v4s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
     ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
+    ; GFX7-FLAT-LABEL: name: load_global_v4s16
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]]
     ; GFX8-LABEL: name: load_global_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -659,11 +891,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_v6s16
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vgpr(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load 12, align 4, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>)
     ; GFX7-LABEL: name: load_global_v6s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load 12, align 4, addrspace 1)
     ; GFX7: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>)
+    ; GFX7-FLAT-LABEL: name: load_global_v6s16
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_96(<6 x s16>) = G_LOAD [[COPY]](p1) :: (load 12, align 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<6 x s16>)
     ; GFX8-LABEL: name: load_global_v6s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -696,11 +938,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_v8s16
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1)
+    ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
     ; GFX7-LABEL: name: load_global_v8s16
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1)
     ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
+    ; GFX7-FLAT-LABEL: name: load_global_v8s16
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[LOAD:%[0-9]+]]:vreg_128(<8 x s16>) = G_LOAD [[COPY]](p1) :: (load 16, align 4, addrspace 1)
+    ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<8 x s16>)
     ; GFX8-LABEL: name: load_global_v8s16
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -737,21 +989,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_2047
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2047
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -797,21 +1089,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_2048
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2048
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_2048
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -867,21 +1199,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_m2047
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2047
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2047
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -927,21 +1299,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_m2048
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2048
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2048
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -987,21 +1399,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_4095
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4095
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_4095
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -1057,21 +1509,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_4096
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4096
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_4096
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -1137,21 +1629,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_m4095
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4095
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4095
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -1207,21 +1739,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_m4096
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4096
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4096
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -1277,21 +1849,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_8191
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8191
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_8191
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -1357,21 +1969,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_8192
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8192
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_8192
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -1437,21 +2089,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_m8191
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8191
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8191
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -1517,21 +2209,61 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1
 
+    ; GFX6-LABEL: name: load_global_s32_from_1_gep_m8192
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
     ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX7: liveins: $vgpr0_vgpr1
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
-    ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1)
+    ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]]
+    ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8192
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1)
+    ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]]
     ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8192
     ; GFX8: liveins: $vgpr0_vgpr1
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
index 3e0de2289aaa..e68fda19d493 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s
 
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
index 8486edd6c744..04735c7c5ac9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir
@@ -1,5 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s
 # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s
+# RUN: llc -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=instruction-select -verify-machineinstrs  -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7-FLAT %s
 # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s
 # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s
@@ -15,11 +17,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2
 
+    ; GFX6-LABEL: name: store_global_s32_to_4
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GFX7-LABEL: name: store_global_s32_to_4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_s32_to_4
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX8-LABEL: name: store_global_s32_to_4
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -52,11 +74,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2
 
+    ; GFX6-LABEL: name: store_global_s32_to_2
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1)
     ; GFX7-LABEL: name: store_global_s32_to_2
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2, addrspace 1)
+    ; GFX7: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_s32_to_2
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-FLAT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2, addrspace 1)
     ; GFX8-LABEL: name: store_global_s32_to_2
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -89,11 +131,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2
 
+    ; GFX6-LABEL: name: store_global_s32_to_1
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1)
     ; GFX7-LABEL: name: store_global_s32_to_1
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GFX7: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1, addrspace 1)
+    ; GFX7: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_s32_to_1
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-FLAT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1, addrspace 1)
     ; GFX8-LABEL: name: store_global_s32_to_1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -127,11 +189,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3
 
+    ; GFX6-LABEL: name: store_global_s64
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; GFX6: G_STORE [[COPY1]](s64), [[COPY]](p1) :: (store 8, addrspace 1)
     ; GFX7-LABEL: name: store_global_s64
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_s64
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX8-LABEL: name: store_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -164,11 +236,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
 
+    ; GFX6-LABEL: name: store_global_s96
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4
+    ; GFX6: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store 12, align 16, addrspace 1)
     ; GFX7-LABEL: name: store_global_s96
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX7: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store 12, align 16, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_s96
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr(s96) = COPY $vgpr2_vgpr3_vgpr4
+    ; GFX7-FLAT: G_STORE [[COPY1]](s96), [[COPY]](p1) :: (store 12, align 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_s96
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -200,11 +282,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
 
+    ; GFX6-LABEL: name: store_global_s128
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store 16, addrspace 1)
     ; GFX7-LABEL: name: store_global_s128
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store 16, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_s128
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr(s128) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT: G_STORE [[COPY1]](s128), [[COPY]](p1) :: (store 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_s128
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -237,11 +329,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3
 
+    ; GFX6-LABEL: name: store_global_v2s32
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX6: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
     ; GFX7-LABEL: name: store_global_v2s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_v2s32
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX8-LABEL: name: store_global_v2s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -274,11 +386,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
 
+    ; GFX6-LABEL: name: store_global_v3s32
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
+    ; GFX6: BUFFER_STORE_DWORDX3_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1)
     ; GFX7-LABEL: name: store_global_v3s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
-    ; GFX7: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16, addrspace 1)
+    ; GFX7: BUFFER_STORE_DWORDX3_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_v3s32
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4
+    ; GFX7-FLAT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_v3s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -311,11 +443,31 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
 
+    ; GFX6-LABEL: name: store_global_v4s32
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
     ; GFX7-LABEL: name: store_global_v4s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
-    ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
+    ; GFX7: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_v4s32
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_v4s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -349,11 +501,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2
 
+    ; GFX6-LABEL: name: store_global_v2s16
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2
+    ; GFX6: G_STORE [[COPY1]](<2 x s16>), [[COPY]](p1) :: (store 4, addrspace 1)
     ; GFX7-LABEL: name: store_global_v2s16
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_v2s16
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX8-LABEL: name: store_global_v2s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -387,11 +549,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3
 
+    ; GFX6-LABEL: name: store_global_v4s16
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3
+    ; GFX6: G_STORE [[COPY1]](<4 x s16>), [[COPY]](p1) :: (store 8, addrspace 1)
     ; GFX7-LABEL: name: store_global_v4s16
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_v4s16
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX8-LABEL: name: store_global_v4s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -425,11 +597,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
 
+    ; GFX6-LABEL: name: store_global_v6s16
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4
+    ; GFX6: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store 12, align 16, addrspace 1)
     ; GFX7-LABEL: name: store_global_v6s16
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4
     ; GFX7: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store 12, align 16, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_v6s16
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr(<6 x s16>) = COPY $vgpr2_vgpr3_vgpr4
+    ; GFX7-FLAT: G_STORE [[COPY1]](<6 x s16>), [[COPY]](p1) :: (store 12, align 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_v6s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -461,11 +643,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
 
+    ; GFX6-LABEL: name: store_global_v8s16
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store 16, addrspace 1)
     ; GFX7-LABEL: name: store_global_v8s16
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store 16, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_v8s16
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr(<8 x s16>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT: G_STORE [[COPY1]](<8 x s16>), [[COPY]](p1) :: (store 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_v8s16
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -498,11 +690,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
 
+    ; GFX6-LABEL: name: store_global_v2s64
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6: G_STORE [[COPY1]](<2 x s64>), [[COPY]](p1) :: (store 16, addrspace 1)
     ; GFX7-LABEL: name: store_global_v2s64
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_v2s64
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_v2s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -536,11 +738,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3
 
+    ; GFX6-LABEL: name: store_global_p1
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY $vgpr2_vgpr3
+    ; GFX6: G_STORE [[COPY1]](p1), [[COPY]](p1) :: (store 8, addrspace 1)
     ; GFX7-LABEL: name: store_global_p1
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
     ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_p1
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+    ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1)
     ; GFX8-LABEL: name: store_global_p1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -574,11 +786,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
 
+    ; GFX6-LABEL: name: store_global_v2p1
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX6: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store 16, addrspace 1)
     ; GFX7-LABEL: name: store_global_v2p1
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX7: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store 16, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_v2p1
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr(<2 x p1>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5
+    ; GFX7-FLAT: G_STORE [[COPY1]](<2 x p1>), [[COPY]](p1) :: (store 16, addrspace 1)
     ; GFX8-LABEL: name: store_global_v2p1
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -611,11 +833,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2
 
+    ; GFX6-LABEL: name: store_global_p3
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX6: G_STORE [[COPY1]](p3), [[COPY]](p1) :: (store 4, addrspace 1)
     ; GFX7-LABEL: name: store_global_p3
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_p3
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX8-LABEL: name: store_global_p3
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -649,11 +881,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3
 
+    ; GFX6-LABEL: name: store_global_v2p3
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
+    ; GFX6: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store 8, addrspace 1)
     ; GFX7-LABEL: name: store_global_v2p3
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
     ; GFX7: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store 8, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_v2p3
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr(<2 x p3>) = COPY $vgpr2_vgpr3
+    ; GFX7-FLAT: G_STORE [[COPY1]](<2 x p3>), [[COPY]](p1) :: (store 8, addrspace 1)
     ; GFX8-LABEL: name: store_global_v2p3
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -685,11 +927,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2
 
+    ; GFX6-LABEL: name: store_atomic_global_s32
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+    ; GFX6: G_STORE [[COPY1]](s32), [[COPY]](p1) :: (store monotonic 4, addrspace 1)
     ; GFX7-LABEL: name: store_atomic_global_s32
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
     ; GFX7: G_STORE [[COPY1]](s32), [[COPY]](p1) :: (store monotonic 4, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_atomic_global_s32
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
+    ; GFX7-FLAT: G_STORE [[COPY1]](s32), [[COPY]](p1) :: (store monotonic 4, addrspace 1)
     ; GFX8-LABEL: name: store_atomic_global_s32
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -722,11 +974,21 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2_vgpr3
 
+    ; GFX6-LABEL: name: store_atomic_global_s64
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; GFX6: G_STORE [[COPY1]](s64), [[COPY]](p1) :: (store monotonic 8, addrspace 1)
     ; GFX7-LABEL: name: store_atomic_global_s64
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX7: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
     ; GFX7: G_STORE [[COPY1]](s64), [[COPY]](p1) :: (store monotonic 8, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_atomic_global_s64
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
+    ; GFX7-FLAT: G_STORE [[COPY1]](s64), [[COPY]](p1) :: (store monotonic 8, addrspace 1)
     ; GFX8-LABEL: name: store_atomic_global_s64
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX8: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
@@ -759,6 +1021,26 @@ body: |
   bb.0:
     liveins:  $vgpr0_vgpr1, $vgpr2
 
+    ; GFX6-LABEL: name: store_global_s32_gep_2047
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
+    ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX6: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX6: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX6: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX6: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GFX7-LABEL: name: store_global_s32_gep_2047
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
@@ -766,14 +1048,34 @@ body: |
     ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
     ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+    ; GFX7: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
+    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
+    ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+    ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3
     ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
     ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
     ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
     ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
     ; GFX7: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
-    ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
-    ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
-    ; GFX7: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
+    ; GFX7: %14:vgpr_32, dead %16:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %14, %subreg.sub1
+    ; GFX7: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GFX7-FLAT-LABEL: name: store_global_s32_gep_2047
+    ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7-FLAT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
+    ; GFX7-FLAT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1
+    ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7-FLAT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0
+    ; GFX7-FLAT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7-FLAT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
+    ; GFX7-FLAT: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec
+    ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_I32_e64_1]], 0, implicit $exec
+    ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_]], %subreg.sub0, %9, %subreg.sub1
+    ; GFX7-FLAT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1)
     ; GFX8-LABEL: name: store_global_s32_gep_2047
     ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
index b1e389053da2..4945f7a338b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
 
 ; FIXME: Merge with other test. DS offset folding doesn't work due to
 ; register bank copies, and no return optimization is missing.
@@ -19,30 +19,30 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
 ; CI-LABEL: lds_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v1, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lds_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -67,32 +67,32 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 ad
 define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
 ; CI-LABEL: lds_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s0, s0, 16
-; CI-NEXT:    v_mov_b32_e32 v1, s0
+; CI-NEXT:    s_add_u32 s2, s2, 16
+; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lds_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s0, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_add_u32 s2, s2, 16
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -117,9 +117,26 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out,
 }
 
 define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+; GCN-LABEL: lds_atomic_dec_noret_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, 42
+; GCN-NEXT:    s_mov_b32 m0, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    ds_dec_rtn_u32 v0, v1, v0
+; GCN-NEXT:    s_endpgm
+; GFX9-LABEL: lds_atomic_dec_noret_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    ds_dec_rtn_u32 v0, v0, v1
+; GFX9-NEXT:    s_endpgm
 ; CI-LABEL: lds_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s0, s[0:1], 0x9
+; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -129,21 +146,13 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) noun
 ;
 ; VI-LABEL: lds_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
+; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
 ; VI-NEXT:    ds_dec_rtn_u32 v0, v1, v0
 ; VI-NEXT:    s_endpgm
-; GFX9-LABEL: lds_atomic_dec_noret_i32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    ds_dec_rtn_u32 v0, v0, v1
-; GFX9-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
@@ -151,7 +160,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) noun
 define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; CI-LABEL: lds_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s0, s[0:1], 0x9
+; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -162,7 +171,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %pt
 ;
 ; VI-LABEL: lds_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
+; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -187,7 +196,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %pt
 define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
 ; CI-LABEL: global_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -201,7 +210,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32
 ;
 ; VI-LABEL: global_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -233,7 +242,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32
 define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
 ; CI-LABEL: global_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -249,7 +258,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %o
 ;
 ; VI-LABEL: global_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -286,7 +295,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %o
 define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
 ; CI-LABEL: global_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -296,7 +305,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) n
 ;
 ; VI-LABEL: global_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -319,7 +328,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) n
 define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
 ; CI-LABEL: global_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -331,7 +340,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)*
 ;
 ; VI-LABEL: global_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -362,7 +371,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; CI-NEXT:    v_mul_lo_u32 v4, 4, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -387,7 +396,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; VI-NEXT:    v_mul_lo_u32 v4, 4, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -445,7 +454,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; CI-NEXT:    v_mul_lo_u32 v0, 4, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -465,7 +474,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; VI-NEXT:    v_mul_lo_u32 v0, 4, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -508,7 +517,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
 define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32* %out, i32* %ptr) #0 {
 ; CI-LABEL: flat_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -522,7 +531,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32* %out, i32* %ptr) #0 {
 ;
 ; VI-LABEL: flat_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -554,7 +563,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32* %out, i32* %ptr) #0 {
 define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32* %out, i32* %ptr) #0 {
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -570,7 +579,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32* %out, i32* %ptr)
 ;
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -607,7 +616,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32* %out, i32* %ptr)
 define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32* %ptr) nounwind {
 ; CI-LABEL: flat_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -617,7 +626,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32* %ptr) nounwind {
 ;
 ; VI-LABEL: flat_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -640,7 +649,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32* %ptr) nounwind {
 define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32* %ptr) nounwind {
 ; CI-LABEL: flat_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -652,7 +661,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32* %ptr) nounwind
 ;
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -683,7 +692,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; CI-NEXT:    v_mul_lo_u32 v4, 4, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -708,7 +717,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; VI-NEXT:    v_mul_lo_u32 v4, 4, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -766,7 +775,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; CI-NEXT:    v_mul_lo_u32 v0, 4, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -786,7 +795,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; VI-NEXT:    v_mul_lo_u32 v0, 4, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -829,7 +838,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
 define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #0 {
 ; CI-LABEL: flat_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -844,7 +853,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #0 {
 ;
 ; VI-LABEL: flat_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -878,7 +887,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #0 {
 define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr) #0 {
 ; CI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -895,7 +904,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr)
 ;
 ; VI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -934,7 +943,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr)
 define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64* %ptr) nounwind {
 ; CI-LABEL: flat_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -945,7 +954,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64* %ptr) nounwind {
 ;
 ; VI-LABEL: flat_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -970,7 +979,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64* %ptr) nounwind {
 define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64* %ptr) nounwind {
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -983,7 +992,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64* %ptr) nounwind
 ;
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1016,7 +1025,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; CI-NEXT:    v_mul_lo_u32 v4, 8, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -1042,7 +1051,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; VI-NEXT:    v_mul_lo_u32 v4, 8, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1102,7 +1111,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; CI-NEXT:    v_mul_lo_u32 v0, 8, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -1123,7 +1132,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; VI-NEXT:    v_mul_lo_u32 v0, 8, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1171,7 +1180,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32
 ; CI-LABEL: atomic_dec_shl_base_lds_0:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mul_lo_u32 v5, 4, v0
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v6, 9
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    v_add_i32_e32 v5, vcc, 0, v5
@@ -1190,7 +1199,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32
 ; VI-LABEL: atomic_dec_shl_base_lds_0:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_mul_lo_u32 v5, 4, v0
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v6, 9
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, 0, v5
@@ -1234,32 +1243,32 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32
 define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
 ; CI-LABEL: lds_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lds_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -1285,34 +1294,34 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 ad
 define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
 ; CI-LABEL: lds_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s0, s0, 32
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lds_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s0, 32
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -1340,7 +1349,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out,
 define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; CI-LABEL: lds_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s0, s[0:1], 0x9
+; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
@@ -1351,7 +1360,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) noun
 ;
 ; VI-LABEL: lds_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
+; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
@@ -1375,7 +1384,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) noun
 define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; CI-LABEL: lds_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s0, s[0:1], 0x9
+; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
@@ -1387,7 +1396,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %pt
 ;
 ; VI-LABEL: lds_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
+; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
@@ -1414,7 +1423,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %pt
 define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
 ; CI-LABEL: global_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1429,7 +1438,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64
 ;
 ; VI-LABEL: global_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1463,7 +1472,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64
 define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
 ; CI-LABEL: global_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1480,7 +1489,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %o
 ;
 ; VI-LABEL: global_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1519,7 +1528,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %o
 define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
 ; CI-LABEL: global_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1530,7 +1539,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) n
 ;
 ; VI-LABEL: global_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1555,7 +1564,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) n
 define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
 ; CI-LABEL: global_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1568,7 +1577,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)*
 ;
 ; VI-LABEL: global_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1601,7 +1610,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; CI-NEXT:    v_mul_lo_u32 v4, 8, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -1627,7 +1636,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; VI-NEXT:    v_mul_lo_u32 v4, 8, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1687,7 +1696,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; CI-NEXT:    v_mul_lo_u32 v0, 8, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -1708,7 +1717,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; VI-NEXT:    v_mul_lo_u32 v0, 8, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1756,7 +1765,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out,
 ; CI-LABEL: atomic_dec_shl_base_lds_0_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mul_lo_u32 v7, 8, v0
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, 9
 ; CI-NEXT:    v_add_i32_e32 v7, vcc, 0, v7
@@ -1776,7 +1785,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out,
 ; VI-LABEL: atomic_dec_shl_base_lds_0_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_mul_lo_u32 v7, 8, v0
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, 9
 ; VI-NEXT:    v_add_u32_e32 v7, vcc, 0, v7

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index 454034b0a428..07f45eaf2b8a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 ; FIXME: Merge with other test. DS offset folding doesn't work due to
 ; register bank copies, and no return optimization is missing.
@@ -20,44 +20,44 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
 ; CI-LABEL: lds_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v1, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lds_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: lds_atomic_inc_ret_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    ds_inc_rtn_u32 v2, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    ds_inc_rtn_u32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -71,47 +71,47 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 ad
 define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
 ; CI-LABEL: lds_atomic_inc_ret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s0, s0, 16
-; CI-NEXT:    v_mov_b32_e32 v1, s0
+; CI-NEXT:    s_add_u32 s2, s2, 16
+; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lds_atomic_inc_ret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s0, 16
-; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_add_u32 s2, s2, 16
+; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: lds_atomic_inc_ret_i32_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s0, 16
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    ds_inc_rtn_u32 v2, v0, v1
+; GFX9-NEXT:    s_add_u32 s2, s2, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    ds_inc_rtn_u32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
@@ -124,7 +124,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out,
 define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; CI-LABEL: lds_atomic_inc_noret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s0, s[0:1], 0x9
+; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -134,7 +134,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) noun
 ;
 ; VI-LABEL: lds_atomic_inc_noret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
+; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -144,7 +144,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) noun
 ;
 ; GFX9-LABEL: lds_atomic_inc_noret_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -157,7 +157,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) noun
 define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; CI-LABEL: lds_atomic_inc_noret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s0, s[0:1], 0x9
+; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -168,7 +168,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %pt
 ;
 ; VI-LABEL: lds_atomic_inc_noret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
+; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -179,7 +179,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %pt
 ;
 ; GFX9-LABEL: lds_atomic_inc_noret_i32_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_add_u32 s0, s0, 16
@@ -194,7 +194,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %pt
 define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
 ; CI-LABEL: global_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -208,7 +208,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32
 ;
 ; VI-LABEL: global_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -222,7 +222,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32
 ;
 ; GFX9-LABEL: global_atomic_inc_ret_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -241,7 +241,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32
 define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
 ; CI-LABEL: global_atomic_inc_ret_i32_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -257,7 +257,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %o
 ;
 ; VI-LABEL: global_atomic_inc_ret_i32_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -273,7 +273,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %o
 ;
 ; GFX9-LABEL: global_atomic_inc_ret_i32_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_add_u32 s2, s2, 16
@@ -293,75 +293,31 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %o
 }
 
 define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
-; CI-LABEL: global_atomic_inc_noret_i32:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: global_atomic_inc_noret_i32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: global_atomic_inc_noret_i32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: global_atomic_inc_noret_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
+; GCN-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
-; CI-LABEL: global_atomic_inc_noret_i32_offset:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s0, s0, 16
-; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: global_atomic_inc_noret_i32_offset:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s0, 16
-; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: global_atomic_inc_noret_i32_offset:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s0, 16
-; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: global_atomic_inc_noret_i32_offset:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_u32 s0, s0, 16
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
+; GCN-NEXT:    s_endpgm
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
@@ -373,7 +329,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; CI-NEXT:    v_mul_lo_u32 v4, 4, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -398,7 +354,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; VI-NEXT:    v_mul_lo_u32 v4, 4, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -421,7 +377,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
 ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, 4, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, 4, v0
@@ -456,7 +412,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; CI-NEXT:    v_mul_lo_u32 v0, 4, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -476,7 +432,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; VI-NEXT:    v_mul_lo_u32 v0, 4, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -494,7 +450,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
 ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, 4, v1
@@ -522,7 +478,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out,
 ; CI-LABEL: atomic_inc_shl_base_lds_0_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mul_lo_u32 v5, 4, v0
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v6, 9
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    v_add_i32_e32 v5, vcc, 0, v5
@@ -541,7 +497,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out,
 ; VI-LABEL: atomic_inc_shl_base_lds_0_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_mul_lo_u32 v5, 4, v0
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v6, 9
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, 0, v5
@@ -560,7 +516,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out,
 ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mul_lo_u32 v1, 4, v0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_add_u32_e32 v3, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 9
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0, v1
@@ -586,47 +542,47 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out,
 define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
 ; CI-LABEL: lds_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lds_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: lds_atomic_inc_ret_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
@@ -638,50 +594,50 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 ad
 define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
 ; CI-LABEL: lds_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s0, s0, 32
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v3, s3
+; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: lds_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s0, 32
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: lds_atomic_inc_ret_i64_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s0, 32
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; GFX9-NEXT:    s_add_u32 s2, s2, 32
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    s_endpgm
@@ -694,7 +650,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out,
 define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; CI-LABEL: lds_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s0, s[0:1], 0x9
+; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
@@ -705,7 +661,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) noun
 ;
 ; VI-LABEL: lds_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
+; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
@@ -716,7 +672,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) noun
 ;
 ; GFX9-LABEL: lds_atomic_inc_noret_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -730,7 +686,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) noun
 define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; CI-LABEL: lds_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dword s0, s[0:1], 0x9
+; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
@@ -742,7 +698,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %pt
 ;
 ; VI-LABEL: lds_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
+; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_mov_b32 m0, -1
@@ -754,7 +710,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %pt
 ;
 ; GFX9-LABEL: lds_atomic_inc_noret_i64_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -770,7 +726,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %pt
 define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
 ; CI-LABEL: global_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -785,7 +741,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64
 ;
 ; VI-LABEL: global_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -800,7 +756,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64
 ;
 ; GFX9-LABEL: global_atomic_inc_ret_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -820,7 +776,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64
 define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
 ; CI-LABEL: global_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    v_mov_b32_e32 v3, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -837,7 +793,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o
 ;
 ; VI-LABEL: global_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -854,7 +810,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o
 ;
 ; GFX9-LABEL: global_atomic_inc_ret_i64_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -875,81 +831,33 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %o
 }
 
 define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
-; CI-LABEL: global_atomic_inc_noret_i64:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: global_atomic_inc_noret_i64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: global_atomic_inc_noret_i64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: global_atomic_inc_noret_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
-; CI-LABEL: global_atomic_inc_noret_i64_offset:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s0, s0, 32
-; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: global_atomic_inc_noret_i64_offset:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s0, 32
-; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: global_atomic_inc_noret_i64_offset:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s0, 32
-; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: global_atomic_inc_noret_i64_offset:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_u32 s0, s0, 32
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN-NEXT:    s_endpgm
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
@@ -961,7 +869,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; CI-NEXT:    v_mul_lo_u32 v4, 8, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -987,7 +895,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; VI-NEXT:    v_mul_lo_u32 v4, 8, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1010,7 +918,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 ;
 ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, 8, v1
@@ -1047,7 +955,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; CI-NEXT:    v_mul_lo_u32 v0, 8, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -1068,7 +976,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; VI-NEXT:    v_mul_lo_u32 v0, 8, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1086,7 +994,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 ;
 ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, 8, v1
@@ -1111,100 +1019,40 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
 }
 
 define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32* %out, i32* %ptr) #0 {
-; CI-LABEL: flat_atomic_inc_ret_i32:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    flat_store_dword v[0:1], v2
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: flat_atomic_inc_ret_i32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: flat_atomic_inc_ret_i32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_store_dword v[0:1], v2
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: flat_atomic_inc_ret_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32* %out
   ret void
 }
 
 define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32* %out, i32* %ptr) #0 {
-; CI-LABEL: flat_atomic_inc_ret_i32_offset:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s2, s2, 16
-; CI-NEXT:    s_addc_u32 s3, s3, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    flat_store_dword v[0:1], v2
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: flat_atomic_inc_ret_i32_offset:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s2, s2, 16
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s2, s2, 16
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_store_dword v[0:1], v2
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: flat_atomic_inc_ret_i32_offset:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_u32 s2, s2, 16
+; GCN-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    flat_atomic_inc v2, v[0:1], v2 glc
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:    s_endpgm
   %gep = getelementptr i32, i32* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32* %out
@@ -1212,75 +1060,31 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32* %out, i32* %ptr)
 }
 
 define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32* %ptr) nounwind {
-; CI-LABEL: flat_atomic_inc_noret_i32:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: flat_atomic_inc_noret_i32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: flat_atomic_inc_noret_i32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: flat_atomic_inc_noret_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
+; GCN-NEXT:    s_endpgm
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32* %ptr) nounwind {
-; CI-LABEL: flat_atomic_inc_noret_i32_offset:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s0, s0, 16
-; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: flat_atomic_inc_noret_i32_offset:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s0, 16
-; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s0, 16
-; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: flat_atomic_inc_noret_i32_offset:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_u32 s0, s0, 16
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
+; GCN-NEXT:    s_endpgm
   %gep = getelementptr i32, i32* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
@@ -1292,7 +1096,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; CI-NEXT:    v_mul_lo_u32 v4, 4, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -1317,7 +1121,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; VI-NEXT:    v_mul_lo_u32 v4, 4, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1340,7 +1144,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, 4, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, 4, v0
@@ -1375,7 +1179,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; CI-NEXT:    v_mul_lo_u32 v0, 4, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -1395,7 +1199,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 4, v1
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; VI-NEXT:    v_mul_lo_u32 v0, 4, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1413,7 +1217,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, 4, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, 4, v1
@@ -1441,7 +1245,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out,
 ; CI-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_mul_lo_u32 v7, 8, v0
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, 9
 ; CI-NEXT:    v_add_i32_e32 v7, vcc, 0, v7
@@ -1461,7 +1265,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out,
 ; VI-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    v_mul_lo_u32 v7, 8, v0
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, 9
 ; VI-NEXT:    v_add_u32_e32 v7, vcc, 0, v7
@@ -1481,7 +1285,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out,
 ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_mul_lo_u32 v3, 8, v0
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 9
 ; GFX9-NEXT:    v_add_u32_e32 v4, 2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0, v3
@@ -1506,106 +1310,42 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out,
 }
 
 define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64* %out, i64* %ptr) #0 {
-; CI-LABEL: flat_atomic_inc_ret_i64:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: flat_atomic_inc_ret_i64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: flat_atomic_inc_ret_i64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: flat_atomic_inc_ret_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64* %out
   ret void
 }
 
 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr) #0 {
-; CI-LABEL: flat_atomic_inc_ret_i64_offset:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s2, s2, 32
-; CI-NEXT:    s_addc_u32 s3, s3, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: flat_atomic_inc_ret_i64_offset:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s2, s2, 32
-; VI-NEXT:    s_addc_u32 s3, s3, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s2, s2, 32
-; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
-; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: flat_atomic_inc_ret_i64_offset:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_u32 s2, s2, 32
+; GCN-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    s_endpgm
   %gep = getelementptr i64, i64* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64* %out
@@ -1613,81 +1353,33 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr)
 }
 
 define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64* %ptr) nounwind {
-; CI-LABEL: flat_atomic_inc_noret_i64:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: flat_atomic_inc_noret_i64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: flat_atomic_inc_noret_i64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: flat_atomic_inc_noret_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN-NEXT:    s_endpgm
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind {
-; CI-LABEL: flat_atomic_inc_noret_i64_offset:
-; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; CI-NEXT:    v_mov_b32_e32 v2, 42
-; CI-NEXT:    v_mov_b32_e32 v3, 0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_u32 s0, s0, 32
-; CI-NEXT:    s_addc_u32 s1, s1, 0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; CI-NEXT:    s_endpgm
-;
-; VI-LABEL: flat_atomic_inc_noret_i64_offset:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 42
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_u32 s0, s0, 32
-; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; VI-NEXT:    s_endpgm
-;
-; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 42
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s0, s0, 32
-; GFX9-NEXT:    s_addc_u32 s1, s1, 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
-; GFX9-NEXT:    s_endpgm
+; GCN-LABEL: flat_atomic_inc_noret_i64_offset:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v2, 42
+; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_u32 s0, s0, 32
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN-NEXT:    s_endpgm
   %gep = getelementptr i64, i64* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
@@ -1699,7 +1391,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; CI-NEXT:    v_mul_lo_u32 v4, 8, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -1725,7 +1417,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; VI-NEXT:    v_mul_lo_u32 v4, 8, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1748,7 +1440,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
 ;
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, 8, v1
@@ -1785,7 +1477,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; CI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; CI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; CI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; CI-NEXT:    v_mul_lo_u32 v0, 8, v0
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
@@ -1806,7 +1498,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; VI-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; VI-NEXT:    v_mul_lo_u32 v1, 8, v1
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    v_mul_hi_u32 v3, 8, v0
 ; VI-NEXT:    v_mul_lo_u32 v0, 8, v0
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
@@ -1824,7 +1516,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 ;
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, 0, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, 8, v1
@@ -1851,18 +1543,18 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
 define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) #0 {
 ; CI-LABEL: nocse_lds_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT:    s_load_dword s0, s[0:1], 0xd
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s6
-; CI-NEXT:    v_mov_b32_e32 v3, s0
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s4
 ; CI-NEXT:    ds_inc_rtn_u32 v4, v3, v2
 ; CI-NEXT:    ds_inc_rtn_u32 v5, v3, v2
-; CI-NEXT:    v_mov_b32_e32 v2, s4
-; CI-NEXT:    v_mov_b32_e32 v3, s5
-; CI-NEXT:    v_mov_b32_e32 v1, s7
+; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v2, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    s_waitcnt lgkmcnt(1)
 ; CI-NEXT:    flat_store_dword v[2:3], v4
 ; CI-NEXT:    s_waitcnt lgkmcnt(1)
@@ -1871,18 +1563,18 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0,
 ;
 ; VI-LABEL: nocse_lds_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s4
 ; VI-NEXT:    ds_inc_rtn_u32 v4, v3, v2
 ; VI-NEXT:    ds_inc_rtn_u32 v5, v3, v2
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(1)
 ; VI-NEXT:    flat_store_dword v[2:3], v4
 ; VI-NEXT:    s_waitcnt lgkmcnt(1)
@@ -1891,17 +1583,17 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0,
 ;
 ; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    ds_inc_rtn_u32 v4, v1, v0
 ; GFX9-NEXT:    ds_inc_rtn_u32 v5, v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX9-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.append.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.append.ll
index 8287a60a069f..1b09c6251912 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.append.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.append.ll
@@ -1,4 +1,4 @@
 ; XUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,SI,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %S/../llvm.amdgcn.ds.append.ll
-; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %S/../llvm.amdgcn.ds.append.ll
-; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %S/../llvm.amdgcn.ds.append.ll
+; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %S/../llvm.amdgcn.ds.append.ll
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %S/../llvm.amdgcn.ds.append.ll
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-GISEL,GCN-GISEL %S/../llvm.amdgcn.ds.append.ll

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.consume.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.consume.ll
index c755c37cad46..7aea170ed1ef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.consume.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.consume.ll
@@ -1,4 +1,4 @@
 ; XUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %S/../llvm.amdgcn.ds.append.ll
-; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %S/../llvm.amdgcn.ds.append.ll
-; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %S/../llvm.amdgcn.ds.append.ll
+; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %S/../llvm.amdgcn.ds.append.ll
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %S/../llvm.amdgcn.ds.append.ll
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %S/../llvm.amdgcn.ds.append.ll

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
index 55929202b1c2..f4ede38b26aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,HSA,ALL %s
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=CO-V2,OS-MESA3D,MESA,ALL %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-unknown -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-unknown -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=OS-UNKNOWN,MESA,ALL %s
 
 ; ALL-LABEL: {{^}}test:
 ; CO-V2: enable_sgpr_kernarg_segment_ptr = 1

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index 946a138bee4f..aed978899565 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mattr=-code-object-v3 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global,-code-object-v3 -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=-code-object-v3 -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0


        


More information about the llvm-commits mailing list