[llvm] r310086 - [AMDGPU] refactor WQM pass in preparation for WWM (NFCI)

Fri Aug 4 11:36:50 PDT 2017

Author: cwabbott
Date: Fri Aug  4 11:36:50 2017
New Revision: 310086

URL: http://llvm.org/viewvc/llvm-project?rev=310086&view=rev
Log:
[AMDGPU] refactor WQM pass in preparation for WWM (NFCI)

Summary:
Right now, the WQM pass conflates two different things when tracking the
Needs of an instruction:

1. Needs can be StateWQM, which is propagated to other instructions, and
means that this instruction (and everything it depends on) must be
calculated in WQM.
2. Needs can be StateExact, which is not propagated to other
instructions, and means that this instruction must not be calculated in
WQM and WQM-ness must not be propagated past this instruction.

This works now because there are only two different states, but in the
future we want to be able to express things like "calculate this in WQM,
but please disable WWM and don't propagate it" (to implement
@llvm.amdgcn.set.inactive). In order to do this, we need to split the
per-instruction Needs field in two: a new Needs field, which can only
contain StateWQM (and in the future, StateWWM) and is propagated to
sources, and a Disabled field, which for now can likewise contain only
StateWQM or nothing.

We keep the per-block tracking the same for now, by translating
Needs/Disabled to the old representation with only StateWQM or
StateExact. The other place that needs special handling is when we
emit the state transitions. We could just translate back to the old
representation there as well, which we almost do, but instead of 0 as a
placeholder value for "any state," we explicitly OR together all the
states an instruction is allowed to be in. This lets us refactor the
code in preparation for WWM, where we'll need to be able to handle
things like "this instruction must be in Exact or WQM, but not WWM."

Reviewers: arsenm, nhaehnle, tpr

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D35523

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp?rev=310086&r1=310085&r2=310086&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp Fri Aug  4 11:36:50 2017
@@ -54,6 +54,7 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/LiveInterval.h"
@@ -108,6 +109,7 @@ static raw_ostream &operator<<(raw_ostre
 
 struct InstrInfo {
   char Needs = 0;
+  char Disabled = 0;
   char OutNeeds = 0;
 };
 
@@ -142,7 +144,8 @@ private:
 
   void markInstruction(MachineInstr &MI, char Flag,
                        std::vector<WorkItem> &Worklist);
-  void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
+  void markInstructionUses(const MachineInstr &MI, char Flag,
+                           std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -220,22 +223,27 @@ void SIWholeQuadMode::markInstruction(Ma
                                       std::vector<WorkItem> &Worklist) {
   InstrInfo &II = Instructions[&MI];
 
-  assert(Flag == StateWQM || Flag == StateExact);
+  assert(Flag == StateWQM);
 
-  // Ignore if the instruction is already marked. The typical case is that we
-  // mark an instruction WQM multiple times, but for atomics it can happen that
-  // Flag is StateWQM, but Needs is already set to StateExact. In this case,
-  // letting the atomic run in StateExact is correct as per the relevant specs.
-  if (II.Needs)
+  // Remove any disabled states from the flag. The user that required it gets
+  // an undefined value in the helper lanes. For example, this can happen if
+  // the result of an atomic is used by an instruction that requires WQM, where
+  // ignoring the request for WQM is correct as per the relevant specs.
+  Flag &= ~II.Disabled;
+
+  // Ignore if the flag is already encompassed by the existing needs, or we
+  // just disabled everything.
+  if ((II.Needs & Flag) == Flag)
     return;
 
-  II.Needs = Flag;
+  II.Needs |= Flag;
   Worklist.push_back(&MI);
 }
 
-/// Mark all instructions defining the uses in \p MI as WQM.
-void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
-                                  std::vector<WorkItem> &Worklist) {
+/// Mark all instructions defining the uses in \p MI with \p Flag.
+void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
+                                          std::vector<WorkItem> &Worklist) {
+  assert(Flag == StateWQM);
   for (const MachineOperand &Use : MI.uses()) {
     if (!Use.isReg() || !Use.isUse())
       continue;
@@ -260,7 +268,7 @@ void SIWholeQuadMode::markUsesWQM(const
         if (Value->isPHIDef())
           continue;
 
-        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+        markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                         Worklist);
       }
 
@@ -268,7 +276,7 @@ void SIWholeQuadMode::markUsesWQM(const
     }
 
     for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
-      markInstruction(DefMI, StateWQM, Worklist);
+      markInstruction(DefMI, Flag, Worklist);
   }
 }
 
@@ -279,11 +287,18 @@ char SIWholeQuadMode::scanInstructions(M
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
 
-  for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
-    MachineBasicBlock &MBB = *BI;
+  // We need to visit the basic blocks in reverse post-order so that we visit
+  // defs before uses, in particular so that we don't accidentally mark an
+  // instruction as needing e.g. WQM before visiting it and realizing it needs
+  // WQM disabled.
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
+    MachineBasicBlock &MBB = **BI;
+    BlockInfo &BBI = Blocks[&MBB];
 
     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
       MachineInstr &MI = *II;
+      InstrInfo &III = Instructions[&MI];
       unsigned Opcode = MI.getOpcode();
       char Flags = 0;
 
@@ -293,7 +308,7 @@ char SIWholeQuadMode::scanInstructions(M
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
         // computed for derivatives.
-        markUsesWQM(MI, Worklist);
+        markInstructionUses(MI, StateWQM, Worklist);
         GlobalFlags |= StateWQM;
         continue;
       } else if (Opcode == AMDGPU::WQM) {
@@ -302,7 +317,14 @@ char SIWholeQuadMode::scanInstructions(M
         Flags = StateWQM;
         LowerToCopyInstrs.push_back(&MI);
       } else if (TII->isDisableWQM(MI)) {
-        Flags = StateExact;
+        BBI.Needs |= StateExact;
+        if (!(BBI.InNeeds & StateExact)) {
+          BBI.InNeeds |= StateExact;
+          Worklist.push_back(&MBB);
+        }
+        GlobalFlags |= StateExact;
+        III.Disabled = StateWQM;
+        continue;
       } else {
         if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
@@ -344,17 +366,19 @@ void SIWholeQuadMode::propagateInstructi
 
   // Control flow-type instructions and stores to temporary memory that are
   // followed by WQM computations must themselves be in WQM.
-  if ((II.OutNeeds & StateWQM) && !II.Needs &&
+  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
     Instructions[&MI].Needs = StateWQM;
     II.Needs = StateWQM;
   }
 
   // Propagate to block level
-  BI.Needs |= II.Needs;
-  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
-    BI.InNeeds |= II.Needs;
-    Worklist.push_back(MBB);
+  if (II.Needs & StateWQM) {
+    BI.Needs |= StateWQM;
+    if (!(BI.InNeeds & StateWQM)) {
+      BI.InNeeds |= StateWQM;
+      Worklist.push_back(MBB);
+    }
   }
 
   // Propagate backwards within block
@@ -370,10 +394,10 @@ void SIWholeQuadMode::propagateInstructi
   }
 
   // Propagate WQM flag to instruction inputs
-  assert(II.Needs != (StateWQM | StateExact));
+  assert(!(II.Needs & StateExact));
 
-  if (II.Needs == StateWQM)
-    markUsesWQM(MI, Worklist);
+  if (II.Needs != 0)
+    markInstructionUses(MI, II.Needs, Worklist);
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -594,7 +618,7 @@ void SIWholeQuadMode::processBlock(Machi
   MachineBasicBlock::iterator First = IE;
   for (;;) {
     MachineBasicBlock::iterator Next = II;
-    char Needs = 0;
+    char Needs = StateExact | StateWQM;
     char OutNeeds = 0;
 
     if (First == IE)
@@ -606,12 +630,15 @@ void SIWholeQuadMode::processBlock(Machi
       if (requiresCorrectState(MI)) {
         auto III = Instructions.find(&MI);
         if (III != Instructions.end()) {
-          Needs = III->second.Needs;
+          if (III->second.Needs & StateWQM)
+            Needs = StateWQM;
+          else
+            Needs &= ~III->second.Disabled;
           OutNeeds = III->second.OutNeeds;
         }
       }
 
-      if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
+      if (MI.isTerminator() && OutNeeds == StateExact)
         Needs = StateExact;
 
       if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
@@ -624,36 +651,40 @@ void SIWholeQuadMode::processBlock(Machi
         Needs = StateWQM;
       else if (BI.OutNeeds == StateExact)
         Needs = StateExact;
+      else
+        Needs = StateWQM | StateExact;
     }
 
-    if (Needs) {
-      if (Needs != State) {
-        MachineBasicBlock::iterator Before =
-            prepareInsertion(MBB, First, II, Needs == StateWQM,
-                             Needs == StateExact || WQMFromExec);
-
-        if (Needs == StateExact) {
-          if (!WQMFromExec && (OutNeeds & StateWQM))
-            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-
-          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
-        } else {
-          assert(WQMFromExec == (SavedWQMReg == 0));
-
-          toWQM(MBB, Before, SavedWQMReg);
-
-          if (SavedWQMReg) {
-            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
-            SavedWQMReg = 0;
-          }
-        }
+    if (!(Needs & State)) {
+      MachineBasicBlock::iterator Before =
+          prepareInsertion(MBB, First, II, Needs == StateWQM,
+                           Needs == StateExact || WQMFromExec);
+
+      if (Needs == StateExact) {
+        if (!WQMFromExec && (OutNeeds & StateWQM))
+          SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 
-        State = Needs;
+        toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+        State = StateExact;
+      } else {
+        assert(Needs == StateWQM);
+        assert(WQMFromExec == (SavedWQMReg == 0));
+
+        toWQM(MBB, Before, SavedWQMReg);
+
+        if (SavedWQMReg) {
+          LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+          SavedWQMReg = 0;
+        }
+        State = StateWQM;
       }
 
       First = IE;
     }
 
+    if (Needs != (StateExact | StateWQM))
+      First = IE;
+
     if (II == IE)
       break;
     II = Next;