[llvm] 33fb23f - [AMDGPU] Merge flat with global in the SILoadStoreOptimizer

Wed Mar 9 10:04:45 PST 2022

Author: Stanislav Mekhanoshin
Date: 2022-03-09T10:04:37-08:00
New Revision: 33fb23f728146cd9a1b8a0bf74c666fb2eeffbb5

URL: https://github.com/llvm/llvm-project/commit/33fb23f728146cd9a1b8a0bf74c666fb2eeffbb5
DIFF: https://github.com/llvm/llvm-project/commit/33fb23f728146cd9a1b8a0bf74c666fb2eeffbb5.diff

LOG: [AMDGPU] Merge flat with global in the SILoadStoreOptimizer

Flat can be merged with flat global since address cast is a no-op.
A combined memory operation needs to be promoted to flat.

Differential Revision: https://reviews.llvm.org/D120431

Added: 
    llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir

Modified: 
    llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 4fc8d3d631a27..6d4e1d2c898b7 100644

--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -79,12 +79,13 @@ enum InstClassEnum {
   MIMG,
   TBUFFER_LOAD,
   TBUFFER_STORE,
-  GLOBAL_LOAD,
   GLOBAL_LOAD_SADDR,
-  GLOBAL_STORE,
   GLOBAL_STORE_SADDR,
   FLAT_LOAD,
-  FLAT_STORE
+  FLAT_STORE,
+  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
+  GLOBAL_STORE // any CombineInfo, they are only ever returned by
+               // getCommonInstClass.
 };
 
 struct AddressRegs {
@@ -275,6 +276,9 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                      const CombineInfo &Paired);
 
+  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
+                                          const CombineInfo &Paired);
+
 public:
   static char ID;
 
@@ -438,7 +442,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
-    return GLOBAL_LOAD;
+  case AMDGPU::FLAT_LOAD_DWORD:
+  case AMDGPU::FLAT_LOAD_DWORDX2:
+  case AMDGPU::FLAT_LOAD_DWORDX3:
+  case AMDGPU::FLAT_LOAD_DWORDX4:
+    return FLAT_LOAD;
   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
@@ -448,22 +456,16 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::GLOBAL_STORE_DWORDX2:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
-    return GLOBAL_STORE;
-  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
-  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
-  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
-  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
-    return GLOBAL_STORE_SADDR;
-  case AMDGPU::FLAT_LOAD_DWORD:
-  case AMDGPU::FLAT_LOAD_DWORDX2:
-  case AMDGPU::FLAT_LOAD_DWORDX3:
-  case AMDGPU::FLAT_LOAD_DWORDX4:
-    return FLAT_LOAD;
   case AMDGPU::FLAT_STORE_DWORD:
   case AMDGPU::FLAT_STORE_DWORDX2:
   case AMDGPU::FLAT_STORE_DWORDX3:
   case AMDGPU::FLAT_STORE_DWORDX4:
     return FLAT_STORE;
+  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+    return GLOBAL_STORE_SADDR;
   }
 }
 
@@ -501,7 +503,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
-    return AMDGPU::GLOBAL_LOAD_DWORD;
+  case AMDGPU::FLAT_LOAD_DWORD:
+  case AMDGPU::FLAT_LOAD_DWORDX2:
+  case AMDGPU::FLAT_LOAD_DWORDX3:
+  case AMDGPU::FLAT_LOAD_DWORDX4:
+    return AMDGPU::FLAT_LOAD_DWORD;
   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
@@ -511,25 +517,37 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::GLOBAL_STORE_DWORDX2:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
-    return AMDGPU::GLOBAL_STORE_DWORD;
-  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
-  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
-  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
-  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
-    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
-  case AMDGPU::FLAT_LOAD_DWORD:
-  case AMDGPU::FLAT_LOAD_DWORDX2:
-  case AMDGPU::FLAT_LOAD_DWORDX3:
-  case AMDGPU::FLAT_LOAD_DWORDX4:
-    return AMDGPU::FLAT_LOAD_DWORD;
   case AMDGPU::FLAT_STORE_DWORD:
   case AMDGPU::FLAT_STORE_DWORDX2:
   case AMDGPU::FLAT_STORE_DWORDX3:
   case AMDGPU::FLAT_STORE_DWORDX4:
     return AMDGPU::FLAT_STORE_DWORD;
+  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
   }
 }
 
+// GLOBAL loads and stores are classified as FLAT initially. If both combined
+// instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
+// If either or both instructions are non segment specific FLAT the resulting
+// combined operation will be FLAT, potentially promoting one of the GLOBAL
+// operations to FLAT.
+// For other instructions return the original unmodified class.
+InstClassEnum
+SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
+                                         const CombineInfo &Paired) {
+  assert(CI.InstClass == Paired.InstClass);
+
+  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
+      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
+    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
+
+  return CI.InstClass;
+}
+
 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   AddressRegs Result;
 
@@ -762,10 +780,15 @@ SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
   // A base pointer for the combined operation is the same as the leading
   // operation's pointer.
   if (Paired < CI)
-    MMOa = MMOb;
+    std::swap(MMOa, MMOb);
+
+  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
+  // If merging FLAT and GLOBAL set address space to FLAT.
+  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
+    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
 
   MachineFunction *MF = CI.I->getMF();
-  return MF->getMachineMemOperand(MMOa, MMOa->getPointerInfo(), Size);
+  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
 }
 
 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
@@ -1576,7 +1599,7 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
   const unsigned Width = CI.Width + Paired.Width;
 
-  switch (CI.InstClass) {
+  switch (getCommonInstClass(CI, Paired)) {
   default:
     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
     // FIXME: Handle d16 correctly

diff  --git a/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir
new file mode 100644
index 0000000000000..28e8db1398efa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir
@@ -0,0 +1,312 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name:            merge_flat_global_load_dword_2
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_global_load_dword_2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `float* undef` + 4, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef` + 4, basealign 4)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            merge_global_flat_load_dword_2
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_flat_load_dword_2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `float addrspace(1)* undef`)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, basealign 8, addrspace 1)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 4, basealign 8)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            merge_global_flat_load_dword_3
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_flat_load_dword_3
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = FLAT_LOAD_DWORDX3 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `float* undef`, align 16)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[FLAT_LOAD_DWORDX3_]].sub0_sub1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX3_]].sub2
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub0
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`, basealign 16)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 8, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            merge_global_flat_load_dword_4
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_flat_load_dword_4
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `i32 addrspace(1)* undef` + 4, align 4, basealign 8)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[FLAT_LOAD_DWORDX4_]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub3
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+    ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 8, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 12, basealign 8, addrspace 1)
+    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef` + 16)
+    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4
+...
+
+---
+name:            merge_flat_global_load_dwordx2
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_global_load_dwordx2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `double* undef`, align 8)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[FLAT_LOAD_DWORDX4_]].sub0_sub1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub2_sub3
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `double* undef`)
+    %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            merge_flat_global_load_dwordx3
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_global_load_dwordx3
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `float* undef`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX4_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub1_sub2_sub3
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`)
+    %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 4, 0, implicit $exec :: (load (s96) from `<3 x i32> addrspace(1)* undef`, addrspace 1)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            merge_global_flat_load_dwordx3
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_global_flat_load_dwordx3
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `i32 addrspace(1)* undef`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX4_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub1_sub2_sub3
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    %2:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `<3 x i32>* undef`)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            no_merge_flat_global_load_dword_saddr
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_flat_global_load_dword_saddr
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`)
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF1]], [[DEF]].sub0, 4, 0, implicit $exec :: (load (s64) from `i32 addrspace(1)* undef` + 4, align 4, addrspace 1)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64_xexec = IMPLICIT_DEF
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`, basealign 4)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+    S_NOP 0, implicit %2, implicit %3, implicit %4
+...
+
+---
+name:            no_merge_global_saddr_flat_load_dword
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_global_saddr_flat_load_dword
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF1]], [[DEF]].sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i32* undef` + 4, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64_xexec = IMPLICIT_DEF
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 4)
+    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 8)
+    S_NOP 0, implicit %2, implicit %3, implicit %4
+...
+
+---
+name:            merge_flat_global_store_dword_2
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: merge_flat_global_store_dword_2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+    ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i32* undef`, align 4)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+...
+
+---
+name:            merge_global_flat_store_dword_2
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: merge_global_flat_store_dword_2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+    ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i32 addrspace(1)* undef`, align 4)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    FLAT_STORE_DWORD %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+...
+
+---
+name:            merge_flat_global_store_dwordx2
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: merge_flat_global_store_dwordx2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1_sub2
+    ; GCN-NEXT: FLAT_STORE_DWORDX3 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `i32* undef`, align 4)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vreg_64_align2 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    GLOBAL_STORE_DWORDX2 %0, %2, 4, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
+...
+
+---
+name:            merge_flat_global_store_dwordx3
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: merge_flat_global_store_dwordx3
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1_sub2_sub3
+    ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `i32* undef`, align 4)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vreg_96_align2 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    GLOBAL_STORE_DWORDX3 %0, %2, 4, 0, implicit $exec :: (store (s96) into `<3 x i32> addrspace(1)* undef`, addrspace 1)
+...
+
+---
+name:            merge_global_flat_store_dwordx2
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: merge_global_flat_store_dwordx2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub2, [[DEF2]], %subreg.sub0_sub1
+    ; GCN-NEXT: FLAT_STORE_DWORDX3 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `i64* undef`, align 8)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vreg_64_align2 = IMPLICIT_DEF
+    GLOBAL_STORE_DWORD %0, %1, 8, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    FLAT_STORE_DWORDX2 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`)
+...
+
+---
+name:            merge_global_flat_store_dwordx3
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: merge_global_flat_store_dwordx3
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub3, [[DEF2]], %subreg.sub0_sub1_sub2
+    ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `<3 x i32>* undef`)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vreg_96_align2 = IMPLICIT_DEF
+    GLOBAL_STORE_DWORD %0, %1, 12, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    FLAT_STORE_DWORDX3 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `<3 x i32>* undef`)
+...
+
+---
+name:            no_merge_flat_global_store_dword_saddr
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: no_merge_flat_global_store_dword_saddr
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], [[DEF2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF]].sub0, [[DEF3]], [[DEF1]], 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64_xexec = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    GLOBAL_STORE_DWORD_SADDR %0.sub0, %3, %1, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+...
+
+---
+name:            no_merge_global_saddr_flat_store_dword
+body:             |
+  bb.0.entry:
+    ; GCN-LABEL: name: no_merge_global_saddr_flat_store_dword
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: GLOBAL_STORE_DWORD_SADDR [[DEF]].sub0, [[DEF2]], [[DEF1]], 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], [[DEF3]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:sreg_64_xexec = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    GLOBAL_STORE_DWORD_SADDR %0.sub0, %2, %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    FLAT_STORE_DWORD %0, %3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+...