[llvm] r327583 - [AMDGPU] Waitcnt pass: Modify the waitcnt pass to propagate info in the case of a single basic block loop. mergeInputScoreBrackets() does this for us; update it so that it processes the single bb's score bracket when processing the single bb's preds. It is, after all, a pred of itself, so it's score bracket is needed.

Wed Mar 14 15:04:32 PDT 2018

Author: msearles
Date: Wed Mar 14 15:04:32 2018
New Revision: 327583

URL: http://llvm.org/viewvc/llvm-project?rev=327583&view=rev
Log:
[AMDGPU] Waitcnt pass: Modify the waitcnt pass to propagate info in the case of a single basic block loop. mergeInputScoreBrackets() does this for us; update it so that it processes the single bb's score bracket when processing the single bb's preds. It is, after all, a pred of itself, so it's score bracket is needed.

Differential Revision: https://reviews.llvm.org/D44434

Added:
    llvm/trunk/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir
Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp?rev=327583&r1=327582&r2=327583&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp Wed Mar 14 15:04:32 2018
@@ -115,11 +115,11 @@ enum RegisterMapping {
        (w) = (enum WaitEventType)((w) + 1))
 
 // This is a per-basic-block object that maintains current score brackets
-// of each wait-counter, and a per-register scoreboard for each wait-couner.
+// of each wait counter, and a per-register scoreboard for each wait counter.
 // We also maintain the latest score for every event type that can change the
 // waitcnt in order to know if there are multiple types of events within
 // the brackets. When multiple types of event happen in the bracket,
-// wait-count may get decreased out of order, therefore we need to put in
+// wait count may get decreased out of order, therefore we need to put in
 // "s_waitcnt 0" before use.
 class BlockWaitcntBrackets {
 public:
@@ -690,7 +690,7 @@ unsigned int BlockWaitcntBrackets::updat
       setScoreLB(T, getScoreUB(T));
     } else if (counterOutOfOrder(T)) {
       // Counter can get decremented out-of-order when there
-      // are multiple types event in the brack. Also emit an s_wait counter
+      // are multiple types event in the bracket. Also emit an s_wait counter
       // with a conservative value of 0 for the counter.
       NeedWait = CNT_MASK(T);
       setScoreLB(T, getScoreUB(T));
@@ -1301,27 +1301,37 @@ void SIInsertWaitcnts::updateEventWaitCn
   }
 }
 
+// Merge the score brackets of the Block's predecessors;
+// this merged score bracket is used when adding waitcnts to the Block
 void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
   int32_t MaxPending[NUM_INST_CNTS] = {0};
   int32_t MaxFlat[NUM_INST_CNTS] = {0};
   bool MixedExpTypes = false;
 
-  // Clear the score bracket state.
-  ScoreBrackets->clear();
-
-  // Compute the number of pending elements on block entry.
+  // For single basic block loops, we need to retain the Block's
+  // score bracket to have accurate Pred info. So, make a copy of Block's
+  // score bracket, clear() it (which retains several important bits of info),
+  // populate, and then replace en masse. For non-single basic block loops,
+  // just clear Block's current score bracket and repopulate in-place.
+  bool IsSelfPred;
+  std::unique_ptr<BlockWaitcntBrackets> S;
+
+  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
+    != Block.pred_end();
+  if (IsSelfPred) {
+    S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+    ScoreBrackets = S.get();
+  }
 
-  // IMPORTANT NOTE: If iterative handling of loops is added, the code will
-  // need to handle single BBs with backedges to themselves. This means that
-  // they will need to retain and not clear their initial state.
+  ScoreBrackets->clear();
 
   // See if there are any uninitialized predecessors. If so, emit an
   // s_waitcnt 0 at the beginning of the block.
-  for (MachineBasicBlock *pred : Block.predecessors()) {
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[pred].get();
-    bool Visited = BlockVisitedSet.count(pred);
+        BlockWaitcntBracketsMap[Pred].get();
+    bool Visited = BlockVisitedSet.count(Pred);
     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
       continue;
     }
@@ -1550,6 +1560,12 @@ void SIInsertWaitcnts::mergeInputScoreBr
       }
     }
   }
+
+  // if a single block loop, update the score brackets. Not needed for other
+  // blocks, as we did this in-place
+  if (IsSelfPred) {
+    BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
+  }
 }
 
 /// Return the "bottom" block of a loop. This differs from

Added: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir?rev=327583&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir Wed Mar 14 15:04:32 2018
@@ -0,0 +1,26 @@
+# RUN: llc -o - %s -march=amdgcn -run-pass=si-insert-waitcnts -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+# Check that the waitcnt propogates info in the case of a single basic block loop
+
+# GCN-LABEL: waitcnt-loop-single-basic-block
+# GCN: bb.0
+# GCN: S_WAITCNT 3952
+# GCN-NEXT: GLOBAL_STORE_DWORD
+# GCN: S_WAITCNT 3953
+# GCN-NEXT: GLOBAL_STORE_DWORD
+
+...
+name: waitcnt-loop-single-basic-block
+body: |
+  bb.0:
+    S_BRANCH %bb.1
+  bb.1:
+    GLOBAL_STORE_DWORD $vgpr7_vgpr8, $vgpr11, 0, 0, 0, implicit $exec
+    $vgpr21 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, implicit $exec
+    $vgpr10 = GLOBAL_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD $vgpr14_vgpr15, $vgpr21, 0, 0, 0, implicit $exec
+    $vgpr11 = GLOBAL_LOAD_DWORD $vgpr11_vgpr12, 0, 0, 0, implicit $exec
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+  bb.2:
+    S_ENDPGM
+...