[llvm] 517171c - [AMDGPU] Extend SILoadStoreOptimizer to handle flat load/stores

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 28 11:27:42 PST 2022


Author: Stanislav Mekhanoshin
Date: 2022-02-28T11:27:30-08:00
New Revision: 517171ce209adcf00214b305b8600587b3de9763

URL: https://github.com/llvm/llvm-project/commit/517171ce209adcf00214b305b8600587b3de9763
DIFF: https://github.com/llvm/llvm-project/commit/517171ce209adcf00214b305b8600587b3de9763.diff

LOG: [AMDGPU] Extend SILoadStoreOptimizer to handle flat load/stores

TODO: merge flat with global promoting to flat.

Differential Revision: https://reviews.llvm.org/D120351

Added: 
    llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir

Modified: 
    llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 588a55b70e978..4fc8d3d631a27 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -82,7 +82,9 @@ enum InstClassEnum {
   GLOBAL_LOAD,
   GLOBAL_LOAD_SADDR,
   GLOBAL_STORE,
-  GLOBAL_STORE_SADDR
+  GLOBAL_STORE_SADDR,
+  FLAT_LOAD,
+  FLAT_STORE
 };
 
 struct AddressRegs {
@@ -244,11 +246,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                         MachineBasicBlock::iterator InsertBefore);
   MachineBasicBlock::iterator
-  mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired,
-                      MachineBasicBlock::iterator InsertBefore);
+  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
+                    MachineBasicBlock::iterator InsertBefore);
   MachineBasicBlock::iterator
-  mergeGlobalStorePair(CombineInfo &CI, CombineInfo &Paired,
-                       MachineBasicBlock::iterator InsertBefore);
+  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
+                     MachineBasicBlock::iterator InsertBefore);
 
   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                            int32_t NewOffset) const;
@@ -323,23 +325,31 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORD:
   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
+  case AMDGPU::FLAT_LOAD_DWORD:
+  case AMDGPU::FLAT_STORE_DWORD:
     return 1;
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX2:
   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
+  case AMDGPU::FLAT_LOAD_DWORDX2:
+  case AMDGPU::FLAT_STORE_DWORDX2:
     return 2;
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
+  case AMDGPU::FLAT_LOAD_DWORDX3:
+  case AMDGPU::FLAT_STORE_DWORDX3:
     return 3;
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
+  case AMDGPU::FLAT_LOAD_DWORDX4:
+  case AMDGPU::FLAT_STORE_DWORDX4:
     return 4;
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
     return 8;
@@ -444,6 +454,16 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
     return GLOBAL_STORE_SADDR;
+  case AMDGPU::FLAT_LOAD_DWORD:
+  case AMDGPU::FLAT_LOAD_DWORDX2:
+  case AMDGPU::FLAT_LOAD_DWORDX3:
+  case AMDGPU::FLAT_LOAD_DWORDX4:
+    return FLAT_LOAD;
+  case AMDGPU::FLAT_STORE_DWORD:
+  case AMDGPU::FLAT_STORE_DWORDX2:
+  case AMDGPU::FLAT_STORE_DWORDX3:
+  case AMDGPU::FLAT_STORE_DWORDX4:
+    return FLAT_STORE;
   }
 }
 
@@ -497,6 +517,16 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
     return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
+  case AMDGPU::FLAT_LOAD_DWORD:
+  case AMDGPU::FLAT_LOAD_DWORDX2:
+  case AMDGPU::FLAT_LOAD_DWORDX3:
+  case AMDGPU::FLAT_LOAD_DWORDX4:
+    return AMDGPU::FLAT_LOAD_DWORD;
+  case AMDGPU::FLAT_STORE_DWORD:
+  case AMDGPU::FLAT_STORE_DWORDX2:
+  case AMDGPU::FLAT_STORE_DWORDX3:
+  case AMDGPU::FLAT_STORE_DWORDX4:
+    return AMDGPU::FLAT_STORE_DWORD;
   }
 }
 
@@ -577,6 +607,14 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::GLOBAL_STORE_DWORDX2:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
+  case AMDGPU::FLAT_LOAD_DWORD:
+  case AMDGPU::FLAT_LOAD_DWORDX2:
+  case AMDGPU::FLAT_LOAD_DWORDX3:
+  case AMDGPU::FLAT_LOAD_DWORDX4:
+  case AMDGPU::FLAT_STORE_DWORD:
+  case AMDGPU::FLAT_STORE_DWORDX2:
+  case AMDGPU::FLAT_STORE_DWORDX3:
+  case AMDGPU::FLAT_STORE_DWORDX4:
     Result.VAddr = true;
     return Result;
   }
@@ -1449,7 +1487,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
   return New;
 }
 
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair(
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
@@ -1492,7 +1530,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair(
   return New;
 }
 
-MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalStorePair(
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
     CombineInfo &CI, CombineInfo &Paired,
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
@@ -1606,6 +1644,28 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
     case 4:
       return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
     }
+  case FLAT_LOAD:
+    switch (Width) {
+    default:
+      return 0;
+    case 2:
+      return AMDGPU::FLAT_LOAD_DWORDX2;
+    case 3:
+      return AMDGPU::FLAT_LOAD_DWORDX3;
+    case 4:
+      return AMDGPU::FLAT_LOAD_DWORDX4;
+    }
+  case FLAT_STORE:
+    switch (Width) {
+    default:
+      return 0;
+    case 2:
+      return AMDGPU::FLAT_STORE_DWORDX2;
+    case 3:
+      return AMDGPU::FLAT_STORE_DWORDX3;
+    case 4:
+      return AMDGPU::FLAT_STORE_DWORDX4;
+    }
   case MIMG:
     assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
            "No overlaps");
@@ -2240,14 +2300,16 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
       NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
       OptimizeListAgain |= CI.Width + Paired.Width < 4;
       break;
+    case FLAT_LOAD:
     case GLOBAL_LOAD:
     case GLOBAL_LOAD_SADDR:
-      NewMI = mergeGlobalLoadPair(CI, Paired, Where->I);
+      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
       OptimizeListAgain |= CI.Width + Paired.Width < 4;
       break;
+    case FLAT_STORE:
     case GLOBAL_STORE:
     case GLOBAL_STORE_SADDR:
-      NewMI = mergeGlobalStorePair(CI, Paired, Where->I);
+      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
       OptimizeListAgain |= CI.Width + Paired.Width < 4;
       break;
     }

diff --git a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
new file mode 100644
index 0000000000000..53538810dd2e0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
@@ -0,0 +1,480 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name:            merge_flat_load_dword_2
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_load_dword_2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i32* undef`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            merge_flat_load_dword_3
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_load_dword_3
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = FLAT_LOAD_DWORDX3 [[DEF]], 0, 1, implicit $exec, implicit $flat_scr :: (load (s96) from `i32* undef`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[FLAT_LOAD_DWORDX3_]].sub0_sub1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX3_]].sub2
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    S_NOP 0, implicit %1, implicit %2, implicit %3
+...
+
+---
+name:            merge_flat_load_dword_4
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_load_dword_4
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 0, 2, implicit $exec, implicit $flat_scr :: (load (s128) from `i32* undef`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[FLAT_LOAD_DWORDX4_]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub3
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+    ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4
+...
+
+---
+name:            merge_flat_load_dword_5
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_load_dword_5
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 0, 3, implicit $exec, implicit $flat_scr :: (load (s128) from `i32* undef`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[FLAT_LOAD_DWORDX4_]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub3
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+    ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+    ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[FLAT_LOAD_DWORD]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5
+...
+
+---
+name:            merge_flat_load_dword_6
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_load_dword_6
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `i32* undef`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[FLAT_LOAD_DWORDX4_]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub3
+    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[COPY]].sub0_sub1
+    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub2
+    ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0
+    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = FLAT_LOAD_DWORDX2 [[DEF]], 16, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i32* undef`, align 4)
+    ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX2_]].sub0
+    ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %6:vgpr_32 = FLAT_LOAD_DWORD %0, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6
+...
+
+---
+name:            merge_flat_load_dwordx2
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_load_dwordx2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `i64* undef`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY [[FLAT_LOAD_DWORDX4_]].sub0_sub1
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub2_sub3
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 4)
+    %2:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 4)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            merge_flat_load_dwordx3_with_dwordx1
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_load_dwordx3_with_dwordx1
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128_align2 = FLAT_LOAD_DWORDX4 [[DEF]], 12, 0, implicit $exec, implicit $flat_scr :: (load (s128) from `i128* undef`, align 8)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY [[FLAT_LOAD_DWORDX4_]].sub0_sub1_sub2
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub3
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `i128* undef`, align 8)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            merge_flat_load_dwordx1_with_dwordx2
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_load_dwordx1_with_dwordx2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96_align2 = FLAT_LOAD_DWORDX3 [[DEF]], 12, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `i32* undef`, align 4)
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[FLAT_LOAD_DWORDX3_]].sub0
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX3_]].sub1_sub2
+    ; GCN-NEXT: S_NOP 0, implicit [[COPY1]], implicit [[COPY]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 8)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            no_merge_flat_load_dword_agpr_with_vgpr
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_flat_load_dword_agpr_with_vgpr
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:agpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            no_merge_flat_load_dword_disjoint
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_flat_load_dword_disjoint
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            no_merge_flat_load_dword_overlap
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_flat_load_dword_overlap
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            no_merge_flat_load_dword_different_cpol
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_flat_load_dword_different_cpol
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    S_NOP 0, implicit %1, implicit %2
+...
+
+---
+name:            merge_flat_store_dword_2
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_store_dword_2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+    ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i32* undef`, align 4)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+...
+
+---
+name:            merge_flat_store_dword_3
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_store_dword_3
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0, killed [[DEF2]], %subreg.sub1
+    ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE killed [[REG_SEQUENCE]], %subreg.sub0_sub1, killed [[DEF3]], %subreg.sub2
+    ; GCN-NEXT: FLAT_STORE_DWORDX3 [[DEF]], killed [[REG_SEQUENCE1]], 4, 1, implicit $exec, implicit $flat_scr :: (store (s96) into `i32* undef`, align 4)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, killed %1, 4, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, killed %2, 8, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+...
+
+---
+name:            merge_flat_store_dword_4
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_store_dword_4
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[DEF1]].sub1, %subreg.sub1, [[DEF1]].sub0, %subreg.sub0
+    ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[DEF1]].sub2, %subreg.sub2, killed [[REG_SEQUENCE]], %subreg.sub0_sub1
+    ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[DEF1]].sub3, %subreg.sub3, killed [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2
+    ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE2]], 4, 2, implicit $exec, implicit $flat_scr :: (store (s128) into `i32* undef`, align 4)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_128 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+...
+
+---
+name:            merge_flat_store_dword_5
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_store_dword_5
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF3:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF4:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF5:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_64_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1
+    ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_96_align2 = REG_SEQUENCE killed [[REG_SEQUENCE]], %subreg.sub0_sub1, [[DEF3]], %subreg.sub2
+    ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[DEF4]], %subreg.sub3
+    ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE2]], 4, 3, implicit $exec, implicit $flat_scr :: (store (s128) into `i32* undef`, align 4)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], [[DEF5]], 20, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:agpr_32 = IMPLICIT_DEF
+    %2:agpr_32 = IMPLICIT_DEF
+    %3:agpr_32 = IMPLICIT_DEF
+    %4:agpr_32 = IMPLICIT_DEF
+    %5:agpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, %1, 4, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %2, 8, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 8)
+    FLAT_STORE_DWORD %0, %3, 12, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %4, 16, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %5, 20, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+...
+
+---
+name:            merge_flat_store_dword_6
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_store_dword_6
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[DEF1]], %subreg.sub0, [[DEF2]], %subreg.sub1
+    ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE killed [[REG_SEQUENCE]], %subreg.sub0_sub1, [[DEF3]], %subreg.sub2
+    ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[DEF4]], %subreg.sub3
+    ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE2]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `i32* undef`, align 8)
+    ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[DEF5]], %subreg.sub0, [[DEF6]], %subreg.sub1
+    ; GCN-NEXT: FLAT_STORE_DWORDX2 [[DEF]], killed [[REG_SEQUENCE3]], 20, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i32* undef`, align 4)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %5:vgpr_32 = IMPLICIT_DEF
+    %6:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 8)
+    FLAT_STORE_DWORD %0, %2, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %3, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %4, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %5, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %6, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+...
+
+---
+name:            merge_flat_store_dwordx2
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_store_dwordx2
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0_sub1, killed [[DEF2]], %subreg.sub2_sub3
+    ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `i64* undef`, align 4)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_64_align2 = IMPLICIT_DEF
+    %2:vreg_64_align2 = IMPLICIT_DEF
+    FLAT_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`, align 4)
+    FLAT_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`, align 4)
+...
+
+---
+name:            merge_flat_store_dwordx3_with_dwordx1
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: merge_flat_store_dwordx3_with_dwordx1
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE killed [[DEF1]], %subreg.sub0_sub1_sub2, killed [[DEF2]], %subreg.sub3
+    ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s128) into `i64* undef`)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vreg_96_align2 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `i64* undef`, align 16)
+    FLAT_STORE_DWORD %0, killed %2, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+...
+
+---
+name:            no_merge_flat_store_dword_agpr_with_vgpr
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_flat_store_dword_agpr_with_vgpr
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:agpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    ; GCN-NEXT: FLAT_STORE_DWORD killed [[DEF]], killed [[DEF2]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:agpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+...
+
+---
+name:            no_merge_flat_store_dword_disjoint
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_flat_store_dword_disjoint
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    ; GCN-NEXT: FLAT_STORE_DWORD killed [[DEF]], killed [[DEF2]], 6, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+...
+
+---
+name:            no_merge_flat_store_dword_overlap
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_flat_store_dword_overlap
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    ; GCN-NEXT: FLAT_STORE_DWORD killed [[DEF]], killed [[DEF2]], 2, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 2)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 2)
+...
+
+---
+name:            no_merge_flat_store_dword_different_cpol
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_flat_store_dword_different_cpol
+    ; GCN: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]], killed [[DEF1]], 0, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    ; GCN-NEXT: FLAT_STORE_DWORD killed [[DEF]], killed [[DEF2]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    %0:vreg_64_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0, killed %1, 0, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+...
+
+---
+name:            no_merge_flat_store_dword_different_vaddr
+body:             |
+  bb.0.entry:
+
+    ; GCN-LABEL: name: no_merge_flat_store_dword_different_vaddr
+    ; GCN: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]].sub0_sub1, killed [[DEF1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    ; GCN-NEXT: FLAT_STORE_DWORD [[DEF]].sub2_sub3, killed [[DEF2]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    %0:vreg_128_align2 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    FLAT_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+...


        


More information about the llvm-commits mailing list