[llvm] r324440 - [AMDGPU] Suppress redundant waitcnt instrs.

Tue Feb 6 18:21:21 PST 2018

Author: msearles
Date: Tue Feb  6 18:21:21 2018
New Revision: 324440

URL: http://llvm.org/viewvc/llvm-project?rev=324440&view=rev
Log:
[AMDGPU] Suppress redundant waitcnt instrs.

1. Run the memory legalizer prior to the waitcnt pass; keep the policy that the waitcnt pass does not remove any waitcnts within the incoming IR.

2. The waitcnt pass doesn't (yet) track waitcnts that exist prior to the waitcnt pass (it just skips over them); because the waitcnt pass is ignorant of them, it may insert a redundant waitcnt. To avoid this, check the prev instr. If it and the to-be-inserted waitcnt are the same, suppress the insertion. We keep the existing waitcnt under the assumption that whoever inserted it (e.g., the memory legalizer) knows what they were doing.

3. Follow-on work: teach the waitcnt pass to record the pre-existing waitcnts for better waitcnt production.

Differential Revision: https://reviews.llvm.org/D42854

Added:
    llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=324440&r1=324439&r2=324440&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Tue Feb  6 18:21:21 2018
@@ -874,13 +874,13 @@ void GCNPassConfig::addPreEmitPass() {
   // cases.
   addPass(&PostRAHazardRecognizerID);
 
+  addPass(createSIMemoryLegalizerPass());
   if (EnableSIInsertWaitcntsPass)
     addPass(createSIInsertWaitcntsPass());
   else
     addPass(createSIInsertWaitsPass());
   addPass(createSIShrinkInstructionsPass());
   addPass(&SIInsertSkipsPassID);
-  addPass(createSIMemoryLegalizerPass());
   addPass(createSIDebuggerInsertNopsPass());
   addPass(&BranchRelaxationPassID);
 }

Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp?rev=324440&r1=324439&r2=324440&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp Tue Feb  6 18:21:21 2018
@@ -361,7 +361,7 @@ private:
   AMDGPUAS AMDGPUASI;
 
   DenseSet<MachineBasicBlock *> BlockVisitedSet;
-  DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+  DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseSet<MachineInstr *> VCCZBugHandledSet;
 
   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
@@ -1114,7 +1114,7 @@ MachineInstr *SIInsertWaitcnts::generate
           BlockWaitcntBrackets *ScoreBracket =
               BlockWaitcntBracketsMap[TBB].get();
           if (!ScoreBracket) {
-            assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+            assert(!BlockVisitedSet.count(TBB));
             BlockWaitcntBracketsMap[TBB] =
                 llvm::make_unique<BlockWaitcntBrackets>();
             ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
@@ -1132,7 +1132,7 @@ MachineInstr *SIInsertWaitcnts::generate
       } else {
         SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
                                           MI.getDebugLoc());
-        CompilerGeneratedWaitcntSet.insert(SWaitInst);
+        TrackedWaitcntSet.insert(SWaitInst);
       }
 
       const MachineOperand &Op =
@@ -1267,7 +1267,7 @@ void SIInsertWaitcnts::mergeInputScoreBr
   for (MachineBasicBlock *pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
         BlockWaitcntBracketsMap[pred].get();
-    bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
+    bool Visited = BlockVisitedSet.count(pred);
     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
       continue;
     }
@@ -1306,7 +1306,7 @@ void SIInsertWaitcnts::mergeInputScoreBr
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
         BlockWaitcntBracketsMap[Pred].get();
-    bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+    bool Visited = BlockVisitedSet.count(Pred);
     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
       continue;
     }
@@ -1354,7 +1354,7 @@ void SIInsertWaitcnts::mergeInputScoreBr
 
   // Set the register scoreboard.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+    if (!BlockVisitedSet.count(Pred)) {
       continue;
     }
 
@@ -1468,7 +1468,7 @@ void SIInsertWaitcnts::mergeInputScoreBr
   // sequencing predecessors, because changes to EXEC require waitcnts due to
   // the delayed nature of these operations.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+    if (!BlockVisitedSet.count(Pred)) {
       continue;
     }
 
@@ -1530,8 +1530,7 @@ void SIInsertWaitcnts::insertWaitcntInBl
     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
       // TODO: Register the old waitcnt and optimize the following waitcnts.
       // Leaving the previously existing waitcnts is conservatively correct.
-      if (CompilerGeneratedWaitcntSet.find(&Inst) ==
-          CompilerGeneratedWaitcntSet.end())
+      if (!TrackedWaitcntSet.count(&Inst))
         ++Iter;
       else {
         ScoreBrackets->setWaitcnt(&Inst);
@@ -1550,7 +1549,7 @@ void SIInsertWaitcnts::insertWaitcntInBl
 
     bool VCCZBugWorkAround = false;
     if (readsVCCZ(Inst) &&
-        (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
+        (!VCCZBugHandledSet.count(&Inst))) {
       if (ScoreBrackets->getScoreLB(LGKM_CNT) <
               ScoreBrackets->getScoreUB(LGKM_CNT) &&
           ScoreBrackets->hasPendingSMEM()) {
@@ -1564,11 +1563,29 @@ void SIInsertWaitcnts::insertWaitcntInBl
     MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
 
     if (SWaitInst) {
-      Block.insert(Inst, SWaitInst);
-      if (ScoreBrackets->getWaitcnt() != SWaitInst) {
-        DEBUG(dbgs() << "insertWaitcntInBlock\n"
-                     << "Old Instr: " << Inst << '\n'
-                     << "New Instr: " << *SWaitInst << '\n';);
+      // We don't (yet) track waitcnts that existed prior to the waitcnt
+      // pass (we just skip over them); because the waitcnt pass is ignorant
+      // of them, it may insert a redundant waitcnt. To avoid this, check
+      // the prev instr. If it and the to-be-inserted waitcnt are the
+      // same, keep the prev waitcnt and skip the insertion. We assume that
+      // whoever, e.g., for memory model, inserted the prev waitcnt really
+      // wants it there.
+      bool insertSWaitInst = true;
+      if (Iter != Block.begin()) {
+        MachineInstr *MIPrevInst = &*std::prev(Iter);
+        if (MIPrevInst &&
+            MIPrevInst->getOpcode() == AMDGPU::S_WAITCNT &&
+            MIPrevInst->getOperand(0).getImm() == SWaitInst->getOperand(0).getImm()) {
+          insertSWaitInst = false;
+        }
+      }
+      if (insertSWaitInst) {
+        Block.insert(Inst, SWaitInst);
+        if (ScoreBrackets->getWaitcnt() != SWaitInst) {
+          DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                       << "Old Instr: " << Inst << '\n'
+                       << "New Instr: " << *SWaitInst << '\n';);
+        }
       }
     }
 
@@ -1656,7 +1673,7 @@ void SIInsertWaitcnts::insertWaitcntInBl
         if (!SWaitInst) {
           SWaitInst = Block.getParent()->CreateMachineInstr(
               TII->get(AMDGPU::S_WAITCNT), DebugLoc());
-          CompilerGeneratedWaitcntSet.insert(SWaitInst);
+          TrackedWaitcntSet.insert(SWaitInst);
           const MachineOperand &Op = MachineOperand::CreateImm(0);
           SWaitInst->addOperand(MF, Op);
 #if 0 // TODO: Format the debug output
@@ -1712,6 +1729,10 @@ bool SIInsertWaitcnts::runOnMachineFunct
   RegisterEncoding.SGPRL =
       RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
 
+  TrackedWaitcntSet.clear();
+  BlockVisitedSet.clear();
+  VCCZBugHandledSet.clear();
+
   // Walk over the blocks in reverse post-dominator order, inserting
   // s_waitcnt where needed.
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
@@ -1738,8 +1759,7 @@ bool SIInsertWaitcnts::runOnMachineFunct
     // at least 1 re-walk over the loop to propagate the information, even if
     // no S_WAITCNT instructions were generated.
     if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
-        (BlockWaitcntProcessedSet.find(&MBB) ==
-         BlockWaitcntProcessedSet.end())) {
+        (!BlockWaitcntProcessedSet.count(&MBB))) {
       BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
       DEBUG(dbgs() << "set-revisit: block"
                    << ContainingLoop->getHeader()->getNumber() << '\n';);

Added: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir?rev=324440&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir Tue Feb  6 18:21:21 2018
@@ -0,0 +1,24 @@
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s
+
+# Check that the waitcnt pass does *not* insert a redundant waitcnt instr.
+# In this testcase, ensure that the pass does not insert a redundant S_WAITCNT 127
+# or S_WAITCNT 3952
+
+...
+# CHECK-LABEL: name: waitcnt-no-redundant
+# CHECK: DS_READ_B64
+# CHECK-NEXT: S_WAITCNT 127
+# CHECK-NEXT: FLAT_ATOMIC_CMPSWAP
+# CHECK-NEXT: S_WAITCNT 3952
+# CHECK-NEXT: BUFFER_WBINVL1_VOL
+
+name: waitcnt-no-redundant
+body: |
+    bb.0:
+    renamable $vgpr0_vgpr1 = DS_READ_B64 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_WAITCNT 127
+    FLAT_ATOMIC_CMPSWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr3_vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+    S_WAITCNT 3952
+    BUFFER_WBINVL1_VOL implicit $exec
+    S_ENDPGM
+...