[PATCH] D26414: AMDGPU: Enable store clustering

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 8 12:30:46 PST 2016


arsenm created this revision.
arsenm added a subscriber: llvm-commits.
Herald added a reviewer: tstellarAMD.
Herald added subscribers: tony-tye, yaxunl, nhaehnle, wdng, kzhuravl.

Also respect the TII hook for these like the generic code does in case we want a flag later to disable this.


https://reviews.llvm.org/D26414

Files:
  lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
  lib/Target/AMDGPU/AMDGPUInstrInfo.h
  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
  test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll


Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
===================================================================
--- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -207,13 +207,21 @@
 }
 
 ; FUNC-LABEL: {{^}}reorder_global_offsets_addr64_soffset0:
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:20{{$}}
 ; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:12{{$}}
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:28{{$}}
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:44{{$}}
+; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:28{{$}}
+; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:44{{$}}
+
+; GCN: v_mov_b32
+; GCN: v_mov_b32
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64{{$}}
+; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:20{{$}}
+
+; GCN: v_add_i32
+; GCN: v_add_i32
+
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:36{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}}
+; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}}
 define void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -102,7 +102,14 @@
 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
       new ScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
-  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
+  if (TII->enableClusterLoads())
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+
+  if (TII->enableClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+
   return DAG;
 }
 
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -40,6 +40,7 @@
   explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
 
   bool enableClusterLoads() const override;
+  bool enableClusterStores() const override;
 
   bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                int64_t Offset1, int64_t Offset2,
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -36,6 +36,10 @@
   return true;
 }
 
+bool AMDGPUInstrInfo::enableClusterStores() const {
+  return true;
+}
+
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
 // the first 16 loads will be interleaved with the stores, and the next 16 will
 // be clustered as expected. It should really split into 2 16 store batches.


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D26414.77240.patch
Type: text/x-patch
Size: 3739 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20161108/51fd1c58/attachment.bin>


More information about the llvm-commits mailing list