[llvm] r333556 - [AMDGPU][Waitcnt] Fix handling of loops with many bottom blocks

Wed May 30 08:47:45 PDT 2018

Author: msearles
Date: Wed May 30 08:47:45 2018
New Revision: 333556

URL: http://llvm.org/viewvc/llvm-project?rev=333556&view=rev
Log:
[AMDGPU][Waitcnt] Fix handling of loops with many bottom blocks

When necessary, the waitcnt insertion pass forces convergence for a loop.
Previously, the forced convergence kicked in after more than 2 passes over a
loop, which doesn't account for loops with many bottom blocks. So, increase the
threshold to (n+1), where n is the number of bottom blocks. This gives the pass
an opportunity to consider the contribution of each bottom block to the overall
loop before the forced convergence potentially kicks in.

Differential Revision: https://reviews.llvm.org/D47488

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir

Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp?rev=333556&r1=333555&r2=333556&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp Wed May 30 08:47:45 2018
@@ -345,7 +345,7 @@ public:
 
   void incIterCnt() { IterCnt++; }
   void resetIterCnt() { IterCnt = 0; }
-  int32_t getIterCnt() { return IterCnt; }
+  unsigned getIterCnt() { return IterCnt; }
 
   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
@@ -1205,7 +1205,7 @@ void SIInsertWaitcnts::generateWaitcntIn
           }
           ScoreBracket->setRevisitLoop(true);
           LLVM_DEBUG(dbgs()
-                         << "set-revisit: Block"
+                         << "set-revisit2: Block"
                          << ContainingLoop->getHeader()->getNumber() << '\n';);
         }
       }
@@ -1639,10 +1639,9 @@ void SIInsertWaitcnts::mergeInputScoreBr
   }
 }
 
-/// Return true if the given basic block is a "bottom" block of a loop. This
-/// differs from MachineLoop::getBottomBlock in that it works even if the loop
-/// is discontiguous. This also handles multiple back-edges for the same
-/// "header" block of a loop.
+/// Return true if the given basic block is a "bottom" block of a loop.
+/// This works even if the loop is discontiguous. This also handles
+/// multiple back-edges for the same "header" block of a loop.
 bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
                                     const MachineBasicBlock *Block) {
   for (MachineBasicBlock *MBB : Loop->blocks()) {
@@ -1776,11 +1775,12 @@ void SIInsertWaitcnts::insertWaitcntInBl
     LLVM_DEBUG(dbgs() << '\n';);
 
     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
-    // placement and doesn't always guarantee convergence for a loop. Each
-    // loop should take at most 2 iterations for it to converge naturally.
-    // When this max is reached and result doesn't converge, we force
-    // convergence by inserting a s_waitcnt at the end of loop footer.
-    if (WaitcntData->getIterCnt() > 2) {
+    // placement, but doesn't guarantee convergence for a loop. Each
+    // loop should take at most (n+1) iterations for it to converge naturally,
+    // where n is the number of bottom blocks. If this threshold is reached and
+    // the result hasn't converged, then we force convergence by inserting
+    // a s_waitcnt at the end of loop footer.
+    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
       // To ensure convergence, need to make wait events at loop footer be no
       // more than those from the previous iteration.
       // As a simplification, instead of tracking individual scores and
@@ -1792,16 +1792,16 @@ void SIInsertWaitcnts::insertWaitcntInBl
         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
           HasPending = true;
+          break;
         }
       }
 
       if (HasPending) {
         if (!SWaitInst) {
-          SWaitInst = Block.getParent()->CreateMachineInstr(
-              TII->get(AMDGPU::S_WAITCNT), DebugLoc());
+          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
+                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                              .addImm(0);
           TrackedWaitcntSet.insert(SWaitInst);
-          const MachineOperand &Op = MachineOperand::CreateImm(0);
-          SWaitInst->addOperand(MF, Op);
 #if 0 // TODO: Format the debug output
           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
           OutputTransformAdd(SWaitInst, context);
@@ -1898,7 +1898,7 @@ bool SIInsertWaitcnts::runOnMachineFunct
       if ((std::count(BlockWaitcntProcessedSet.begin(),
                       BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
         BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-        LLVM_DEBUG(dbgs() << "set-revisit: Block"
+        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                           << ContainingLoop->getHeader()->getNumber() << '\n';);
       }
     }
@@ -1906,7 +1906,7 @@ bool SIInsertWaitcnts::runOnMachineFunct
     // Walk over the instructions.
     insertWaitcntInBlock(MF, MBB);
 
-    // Flag that waitcnts have been processed at least once.
+    // Record that waitcnts have been processed at least once for this block.
     BlockWaitcntProcessedSet.push_back(&MBB);
 
     // See if we want to revisit the loop. If a loop has multiple back-edges,
@@ -2004,8 +2004,12 @@ bool SIInsertWaitcnts::runOnMachineFunct
     // TODO: Could insert earlier and schedule more liberally with operations
     // that only use caller preserved registers.
     MachineBasicBlock &EntryBB = MF.front();
-    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-      .addImm(0);
+    auto SWaitInst = BuildMI(EntryBB, EntryBB.getFirstNonPHI(),
+                             DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                             .addImm(0);
+
+    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+               << "New Instr: " << *SWaitInst << '\n');
 
     Modified = true;
   }

Modified: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir?rev=333556&r1=333555&r2=333556&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir Wed May 30 08:47:45 2018
@@ -57,3 +57,37 @@ body:             |
     EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec
     S_ENDPGM
 ...
+---
+
+# GCN-LABEL: name: waitcnt-multiple-back-edges{{$}}
+# GCN: bb.0:
+# GCN: S_WAITCNT 0
+# GCN-NEXT: S_BRANCH %bb.2
+
+name: waitcnt-multiple-back-edges
+body: |
+  bb.0:
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+
+  bb.3:
+    S_CBRANCH_VCCNZ %bb.5, implicit $vcc
+
+  bb.4:
+    BUFFER_ATOMIC_ADD_OFFSET renamable $vgpr0, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 0, 4, 0, implicit $exec
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.6
+
+  bb.5:
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.6
+
+  bb.6:
+    S_CBRANCH_SCC1 %bb.0, implicit $scc
+    S_ENDPGM
+...