[llvm] r330337 - [AMDGPU] Do not only rely on BB number when finding bottom loop

Mark Searles via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 19 08:42:30 PDT 2018


Author: msearles
Date: Thu Apr 19 08:42:30 2018
New Revision: 330337

URL: http://llvm.org/viewvc/llvm-project?rev=330337&view=rev
Log:
[AMDGPU] Do not only rely on BB number when finding bottom loop

We should also check that the "bottom" basic block of a loopis a successor of the "header" basic block, otherwise we don't propagate the information correctly when the CFG is complex. This fixes an important rendering problem with Wolfsentein 2, because of one vector-memory wait was missing.

Differential Revision: https://reviews.llvm.org/D43831

Added:
    llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp?rev=330337&r1=330336&r2=330337&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp Thu Apr 19 08:42:30 2018
@@ -367,7 +367,7 @@ private:
   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
       BlockWaitcntBracketsMap;
 
-  DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
 
   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
 
@@ -403,7 +403,8 @@ public:
   void updateEventWaitCntAfter(MachineInstr &Inst,
                                BlockWaitcntBrackets *ScoreBrackets);
   void mergeInputScoreBrackets(MachineBasicBlock &Block);
-  MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
+  unsigned countNumBottomBlocks(const MachineLoop *Loop);
   void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
   void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
   bool isWaitcntStronger(unsigned LHS, unsigned RHS);
@@ -1568,15 +1569,29 @@ void SIInsertWaitcnts::mergeInputScoreBr
   }
 }
 
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
-  MachineBasicBlock *Bottom = Loop->getHeader();
-  for (MachineBasicBlock *MBB : Loop->blocks())
-    if (MBB->getNumber() > Bottom->getNumber())
-      Bottom = MBB;
-  return Bottom;
+/// Return true if the given basic block is a "bottom" block of a loop. This
+/// differs from MachineLoop::getBottomBlock in that it works even if the loop
+/// is discontiguous. This also handles multiple back-edges for the same
+/// "header" block of a loop.
+bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
+                                    const MachineBasicBlock *Block) {
+  for (MachineBasicBlock *MBB : Loop->blocks()) {
+    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Count the number of "bottom" basic blocks of a loop.
+unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
+  unsigned Count = 0;
+  for (MachineBasicBlock *MBB : Loop->blocks()) {
+    if (MBB->isSuccessor(Loop->getHeader())) {
+      Count++;
+    }
+  }
+  return Count;
 }
 
 // Generate s_waitcnt instructions where needed.
@@ -1685,7 +1700,7 @@ void SIInsertWaitcnts::insertWaitcntInBl
 
   // Check if we need to force convergence at loop footer.
   MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
-  if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
     LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
     WaitcntData->print();
     DEBUG(dbgs() << '\n';);
@@ -1773,6 +1788,7 @@ bool SIInsertWaitcnts::runOnMachineFunct
   TrackedWaitcntSet.clear();
   BlockVisitedSet.clear();
   VCCZBugHandledSet.clear();
+  LoopWaitcntDataMap.clear();
 
   // Walk over the blocks in reverse post-dominator order, inserting
   // s_waitcnt where needed.
@@ -1799,21 +1815,30 @@ bool SIInsertWaitcnts::runOnMachineFunct
     // If we are walking into the block from before the loop, then guarantee
     // at least 1 re-walk over the loop to propagate the information, even if
     // no S_WAITCNT instructions were generated.
-    if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
-        (!BlockWaitcntProcessedSet.count(&MBB))) {
-      BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-      DEBUG(dbgs() << "set-revisit: Block"
-                   << ContainingLoop->getHeader()->getNumber() << '\n';);
+    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
+      unsigned Count = countNumBottomBlocks(ContainingLoop);
+
+      // If the loop has multiple back-edges, and so more than one "bottom"
+      // basic block, we have to guarantee a re-walk over every blocks.
+      if ((std::count(BlockWaitcntProcessedSet.begin(),
+                      BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
+        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+        DEBUG(dbgs() << "set-revisit: Block"
+                     << ContainingLoop->getHeader()->getNumber() << '\n';);
+      }
     }
 
     // Walk over the instructions.
     insertWaitcntInBlock(MF, MBB);
 
     // Flag that waitcnts have been processed at least once.
-    BlockWaitcntProcessedSet.insert(&MBB);
+    BlockWaitcntProcessedSet.push_back(&MBB);
 
-    // See if we want to revisit the loop.
-    if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+    // See if we want to revisit the loop. If a loop has multiple back-edges,
+    // we shouldn't revisit the same "bottom" basic block.
+    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
+        std::count(BlockWaitcntProcessedSet.begin(),
+                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
       MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
       if (EntrySB && EntrySB->getRevisitLoop()) {

Added: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir?rev=330337&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir Thu Apr 19 08:42:30 2018
@@ -0,0 +1,59 @@
+# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-waitcnts -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: waitcnt-back-edge-loop
+# GCN: bb.2
+# GCN: S_WAITCNT 112
+# GCN: $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec
+
+---
+name: waitcnt-back-edge-loop
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+    $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 3, killed $sgpr4, implicit $exec
+    $vgpr3 = V_CNDMASK_B32_e64 -1082130432, 1065353216, killed $sgpr0_sgpr1, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.3:
+    successors: %bb.1
+
+    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+
+  bb.1:
+    successors: %bb.5, %bb.2
+
+    $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec
+    V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.5, implicit killed $vcc
+
+  bb.2:
+    successors: %bb.4, %bb.3
+
+    V_CMP_EQ_U32_e32 9, killed $vgpr5, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.3, implicit killed $vcc
+
+  bb.4:
+    successors: %bb.3, %bb.1
+
+    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+    $vgpr4 = V_CVT_I32_F32_e32 $vgpr5, implicit $exec
+    V_CMP_EQ_U32_e32 2, killed $vgpr4, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    $vgpr4 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_BRANCH %bb.3
+
+  bb.5:
+
+    $vgpr4 = V_MAC_F32_e32 killed $vgpr0, killed $vgpr3, killed $vgpr4, implicit $exec
+    EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec
+    S_ENDPGM
+...




More information about the llvm-commits mailing list