[llvm] 35e6a9c - AMDGPU: Break read2/write2 search range on a memory fence

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 24 12:53:39 PDT 2020


Author: Matt Arsenault
Date: 2020-04-24T15:53:30-04:00
New Revision: 35e6a9c8397e9550382961f2e020987982e9ccd7

URL: https://github.com/llvm/llvm-project/commit/35e6a9c8397e9550382961f2e020987982e9ccd7
DIFF: https://github.com/llvm/llvm-project/commit/35e6a9c8397e9550382961f2e020987982e9ccd7.diff

LOG: AMDGPU: Break read2/write2 search range on a memory fence

This is to fix performance regressions introduced by
86c944d790728891801778b8d98c2c65a83f36a5.

The old search would collect all potentially mergeable instructions in
the entire block. In this case, the same address is written in
multiple places in the block, on both sides of a fence. When sorted by
offset, the two identical addresses, which cannot be merged across the
fence, would end up next to each other and the merge would give up.

Break the search space when we encounter an instruction we won't be
able to merge across. This will keep the identical addresses in
different merge attempts.

This may also improve compile time by reducing the merge list size.
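
For illustration only (not part of this commit), a minimal standalone C++
sketch of the search-splitting idea; the Inst struct and mergeCandidates
helper are made up for the example and are not LLVM APIs:

  // Instead of collecting candidates across the whole block, stop at any
  // instruction we cannot merge across and start a fresh candidate list
  // after it, so identical addresses end up in separate merge attempts.
  #include <cstdio>
  #include <vector>

  struct Inst {
    int Offset;    // address offset of the memory access
    bool IsFence;  // true if nothing may be merged across this instruction
  };

  // Stand-in for the real merge step: just report how many candidates
  // were collected in this section of the block.
  static void mergeCandidates(const std::vector<Inst> &Candidates) {
    std::printf("merging %zu candidates\n", Candidates.size());
  }

  int main() {
    // Writes to offsets 0 and 8 appear on both sides of a fence. The old
    // approach put all four writes in one list; sorted by offset, the
    // duplicate offsets sat next to each other and the merge gave up.
    std::vector<Inst> Block = {
        {0, false}, {8, false}, {0, true}, {0, false}, {8, false}};

    std::vector<Inst> Candidates;
    for (const Inst &I : Block) {
      if (I.IsFence) {
        // Merge what we have, then resume a separate search after the fence.
        mergeCandidates(Candidates);
        Candidates.clear();
        continue;
      }
      Candidates.push_back(I);
    }
    mergeCandidates(Candidates);
    return 0;
  }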

Added: 
    llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll

Modified: 
    llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 31fcdc242235..62a7d9cd0ed2 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -265,8 +265,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
   void addInstToMergeableList(const CombineInfo &CI,
                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
-  bool collectMergeableInsts(MachineBasicBlock &MBB,
-                  std::list<std::list<CombineInfo> > &MergeableInsts) const;
+
+  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
+      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
+      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
+      std::list<std::list<CombineInfo>> &MergeableInsts) const;
 
 public:
   static char ID;
@@ -1944,31 +1947,38 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
   MergeableInsts.emplace_back(1, CI);
 }
 
-bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
-                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
+std::pair<MachineBasicBlock::iterator, bool>
+SILoadStoreOptimizer::collectMergeableInsts(
+    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
+    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
+    std::list<std::list<CombineInfo>> &MergeableInsts) const {
   bool Modified = false;
-  // Contain the list
-  MemInfoMap Visited;
-  // Contains the list of instructions for which constant offsets are being
-  // promoted to the IMM.
-  SmallPtrSet<MachineInstr *, 4> AnchorList;
 
   // Sort potential mergeable instructions into lists.  One list per base address.
   unsigned Order = 0;
-  for (MachineInstr &MI : MBB.instrs()) {
+  MachineBasicBlock::iterator BlockI = Begin;
+  for (; BlockI != End; ++BlockI) {
+    MachineInstr &MI = *BlockI;
+
     // We run this before checking if an address is mergeable, because it can produce
     // better code even if the instructions aren't mergeable.
     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
       Modified = true;
 
+    // Don't combine if volatile. We also won't be able to merge across this, so
+    // break the search. We can look after this barrier for separate merges.
+    if (MI.hasOrderedMemoryRef()) {
+      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
+
+      // Search will resume after this instruction in a separate merge list.
+      ++BlockI;
+      break;
+    }
+
     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
     if (InstClass == UNKNOWN)
       continue;
 
-    // Don't combine if volatile.
-    if (MI.hasOrderedMemoryRef())
-      continue;
-
     CombineInfo CI;
     CI.setMI(MI, *TII, *STM);
     CI.Order = Order++;
@@ -2008,7 +2018,7 @@ bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
     ++I;
   }
 
-  return Modified;
+  return std::make_pair(BlockI, Modified);
 }
 
 // Scan through looking for adjacent LDS operations with constant offsets from
@@ -2161,15 +2171,33 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
 
   bool Modified = false;
 
+  // Contains the list of instructions for which constant offsets are being
+  // promoted to the IMM. This is tracked for an entire block at a time.
+  SmallPtrSet<MachineInstr *, 4> AnchorList;
+  MemInfoMap Visited;
 
   for (MachineBasicBlock &MBB : MF) {
-    std::list<std::list<CombineInfo> > MergeableInsts;
-    // First pass: Collect list of all instructions we know how to merge.
-    Modified |= collectMergeableInsts(MBB, MergeableInsts);
-    do {
-      OptimizeAgain = false;
-      Modified |= optimizeBlock(MergeableInsts);
-    } while (OptimizeAgain);
+    MachineBasicBlock::iterator SectionEnd;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
+         I = SectionEnd) {
+      bool CollectModified;
+      std::list<std::list<CombineInfo>> MergeableInsts;
+
+      // First pass: Collect list of all instructions we know how to merge in a
+      // subset of the block.
+      std::tie(SectionEnd, CollectModified) =
+          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
+
+      Modified |= CollectModified;
+
+      do {
+        OptimizeAgain = false;
+        Modified |= optimizeBlock(MergeableInsts);
+      } while (OptimizeAgain);
+    }
+
+    Visited.clear();
+    AnchorList.clear();
   }
 
   return Modified;

diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
new file mode 100644
index 000000000000..d69f5ffac781
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+ at lds = internal addrspace(3) global [576 x double] undef, align 16
+
+; Stores to the same address appear multiple places in the same
+; block. When sorted by offset, the merges would fail. We should form
+; two groupings of ds_write2_b64 on either side of the fence.
+define amdgpu_kernel void @same_address_fence_merge_write2() #0 {
+; GCN-LABEL: same_address_fence_merge_write2:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GCN-NEXT:    s_mov_b32 s1, 0x40100000
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_add_u32_e32 v3, 0x840, v2
+; GCN-NEXT:    v_add_u32_e32 v4, 0xc60, v2
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
+; GCN-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_barrier
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
+; GCN-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[0:1] offset1:66
+; GCN-NEXT:    s_endpgm
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+  %tmp1 = getelementptr inbounds [576 x double], [576 x double] addrspace(3)* @lds, i32 0, i32 %tmp
+  store double 4.000000e+00, double addrspace(3)* %tmp1, align 8
+  %tmp2 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 66
+  store double 4.000000e+00, double addrspace(3)* %tmp2, align 8
+  %tmp3 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 132
+  store double 4.000000e+00, double addrspace(3)* %tmp3, align 8
+  %tmp4 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 198
+  store double 4.000000e+00, double addrspace(3)* %tmp4, align 8
+  %tmp5 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 264
+  store double 4.000000e+00, double addrspace(3)* %tmp5, align 8
+  %tmp6 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 330
+  store double 4.000000e+00, double addrspace(3)* %tmp6, align 8
+  %tmp7 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 396
+  store double 4.000000e+00, double addrspace(3)* %tmp7, align 8
+  %tmp8 = getelementptr inbounds double, double addrspace(3)* %tmp1, i32 462
+  store double 4.000000e+00, double addrspace(3)* %tmp8, align 8
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  store double 1.000000e+00, double addrspace(3)* %tmp1, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp2, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp3, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp4, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp5, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp6, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp7, align 8
+  store double 1.000000e+00, double addrspace(3)* %tmp8, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare void @llvm.amdgcn.s.barrier() #1
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { convergent nounwind }
+
+!0 = !{i32 0, i32 1024}
