[llvm] r347850 - AMDGPU/InsertWaitcnts: Simplify pending events tracking

Thu Nov 29 03:06:15 PST 2018

Author: nha
Date: Thu Nov 29 03:06:14 2018
New Revision: 347850

URL: http://llvm.org/viewvc/llvm-project?rev=347850&view=rev
Log:
AMDGPU/InsertWaitcnts: Simplify pending events tracking

Summary:
Instead of storing the "score" (last time point) of the various relevant
events, only store whether an event is pending or not.

This is sufficient, because whenever only one event of a count type is
pending, its last time point is naturally the upper bound of all time
points of this count type, and when multiple event types are pending,
the count type has gone out of order and an s_waitcnt to 0 is required
to clear any pending event type (and will then clear all pending event
types for that count type).

This also removes the special handling of GDS_GPR_LOCK and EXP_GPR_LOCK.
I do not understand what this special handling ever attempted to achieve.
It has existed ever since the original port from an internal code base,
so my best guess is that it solved a problem related to EXEC handling in
that internal code base.

Reviewers: msearles, rampitec, scott.linder, kanarayan

Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits, hakzsam

Differential Revision: https://reviews.llvm.org/D54228

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp?rev=347850&r1=347849&r2=347850&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp Thu Nov 29 03:06:14 2018
@@ -13,6 +13,14 @@
 /// Memory reads and writes are issued asynchronously, so we need to insert
 /// S_WAITCNT instructions when we want to access any of their results or
 /// overwrite any register that's used asynchronously.
+///
+/// TODO: This pass currently keeps one timeline per hardware counter. A more
+/// finely-grained approach that keeps one timeline per event type could
+/// sometimes get away with generating weaker s_waitcnt instructions. For
+/// example, when both SMEM and LDS are in flight and we need to wait for
+/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
+/// but the pass will currently generate a conservative lgkmcnt(0) because
+/// multiple event types are in flight.
 //
 //===----------------------------------------------------------------------===//
 
@@ -132,10 +140,13 @@ enum WaitEventType {
   NUM_WAIT_EVENTS,
 };
 
-iterator_range<enum_iterator<WaitEventType>> wait_event_types() {
-  return make_range(enum_iterator<WaitEventType>(VMEM_ACCESS),
-                    enum_iterator<WaitEventType>(NUM_WAIT_EVENTS));
-}
+static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
+  (1 << VMEM_ACCESS),
+  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+      (1 << SQ_MESSAGE),
+  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+      (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
+};
 
 // The mapping is:
 //  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
@@ -232,24 +243,12 @@ public:
 
   // Mapping from event to counter.
   InstCounterType eventCounter(WaitEventType E) {
-    switch (E) {
-    case VMEM_ACCESS:
+    if (E == VMEM_ACCESS)
       return VM_CNT;
-    case LDS_ACCESS:
-    case GDS_ACCESS:
-    case SQ_MESSAGE:
-    case SMEM_ACCESS:
+    if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
       return LGKM_CNT;
-    case EXP_GPR_LOCK:
-    case GDS_GPR_LOCK:
-    case VMW_GPR_LOCK:
-    case EXP_POS_ACCESS:
-    case EXP_PARAM_ACCESS:
-      return EXP_CNT;
-    default:
-      llvm_unreachable("unhandled event type");
-    }
-    return NUM_INST_CNTS;
+    assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
+    return EXP_CNT;
   }
 
   void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
@@ -278,7 +277,8 @@ public:
   void clear() {
     memset(ScoreLBs, 0, sizeof(ScoreLBs));
     memset(ScoreUBs, 0, sizeof(ScoreUBs));
-    memset(EventUBs, 0, sizeof(EventUBs));
+    PendingEvents = 0;
+    memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
     for (auto T : inst_counter_types())
       memset(VgprScores[T], 0, sizeof(VgprScores[T]));
     memset(SgprScores, 0, sizeof(SgprScores));
@@ -296,15 +296,9 @@ public:
   void setWaitAtBeginning() { WaitAtBeginning = true; }
   void clearWaitAtBeginning() { WaitAtBeginning = false; }
   bool getWaitAtBeginning() const { return WaitAtBeginning; }
-  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
   int32_t getMaxVGPR() const { return VgprUB; }
   int32_t getMaxSGPR() const { return SgprUB; }
 
-  int32_t getEventUB(enum WaitEventType W) const {
-    assert(W < NUM_WAIT_EVENTS);
-    return EventUBs[W];
-  }
-
   bool counterOutOfOrder(InstCounterType T) const;
   bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
   bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
@@ -316,11 +310,12 @@ public:
                      const MachineRegisterInfo *MRI, WaitEventType E,
                      MachineInstr &MI);
 
-  bool hasPendingSMEM() const {
-    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
-            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
+  bool hasPendingEvent(WaitEventType E) const {
+    return PendingEvents & (1 << E);
   }
 
+  void mergePendingEvents(const BlockWaitcntBrackets &Other);
+
   bool hasPendingFlat() const {
     return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
              LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
@@ -343,11 +338,6 @@ public:
   void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
   int32_t getPostOrder() const { return PostOrder; }
 
-  bool mixedExpTypes() const { return MixedExpTypes; }
-  void setMixedExpTypes(bool MixedExpTypesIn) {
-    MixedExpTypes = MixedExpTypesIn;
-  }
-
   void print(raw_ostream &);
   void dump() { print(dbgs()); }
 
@@ -355,11 +345,11 @@ private:
   const GCNSubtarget *ST = nullptr;
   bool WaitAtBeginning = false;
   bool RevisitLoop = false;
-  bool MixedExpTypes = false;
   int32_t PostOrder = 0;
   int32_t ScoreLBs[NUM_INST_CNTS] = {0};
   int32_t ScoreUBs[NUM_INST_CNTS] = {0};
-  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
+  uint32_t PendingEvents = 0;
+  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
   // Remember the last flat memory operation.
   int32_t LastFlat[NUM_INST_CNTS] = {0};
   // wait_cnt scores for every vgpr.
@@ -559,19 +549,17 @@ void BlockWaitcntBrackets::updateByEvent
   const MachineRegisterInfo &MRIA = *MRI;
   InstCounterType T = eventCounter(E);
   int32_t CurrScore = getScoreUB(T) + 1;
-  // EventUB and ScoreUB need to be update regardless if this event changes
-  // the score of a register or not.
+  // PendingEvents and ScoreUB need to be update regardless if this event
+  // changes the score of a register or not.
   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
-  EventUBs[E] = CurrScore;
+  if (!hasPendingEvent(E)) {
+    if (PendingEvents & WaitEventMaskForInst[T])
+      MixedPendingEvents[T] = true;
+    PendingEvents |= 1 << E;
+  }
   setScoreUB(T, CurrScore);
 
   if (T == EXP_CNT) {
-    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
-    // is required.
-    if (!MixedExpTypes) {
-      MixedExpTypes = counterOutOfOrder(EXP_CNT);
-    }
-
     // Put score on the source vgprs. If this is a store, just use those
     // specific register(s).
     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
@@ -803,9 +791,6 @@ void BlockWaitcntBrackets::applyWaitcnt(
   applyWaitcnt(VM_CNT, Wait.VmCnt);
   applyWaitcnt(EXP_CNT, Wait.ExpCnt);
   applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
-
-  if (Wait.ExpCnt == 0)
-    setMixedExpTypes(false);
 }
 
 void BlockWaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -818,76 +803,28 @@ void BlockWaitcntBrackets::applyWaitcnt(
     setScoreLB(T, std::max(getScoreLB(T), UB - (int32_t)Count));
   } else {
     setScoreLB(T, UB);
+    MixedPendingEvents[T] = false;
+    PendingEvents &= ~WaitEventMaskForInst[T];
+  }
+}
+
+void BlockWaitcntBrackets::mergePendingEvents(const BlockWaitcntBrackets &Other) {
+  for (auto T : inst_counter_types()) {
+    uint32_t Old = PendingEvents & WaitEventMaskForInst[T];
+    uint32_t New = Other.PendingEvents & WaitEventMaskForInst[T];
+    if (Other.MixedPendingEvents[T] || (Old && New && Old != New))
+      MixedPendingEvents[T] = true;
+    PendingEvents |= New;
   }
 }
 
 // Where there are multiple types of event in the bracket of a counter,
 // the decrement may go out of order.
 bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
-  switch (T) {
-  case VM_CNT:
-    return false;
-  case LGKM_CNT: {
-    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      // Scalar memory read always can go out of order.
-      return true;
-    }
-    int NumEventTypes = 0;
-    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
-        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
-      NumEventTypes++;
-    }
-    if (NumEventTypes <= 1) {
-      return false;
-    }
-    break;
-  }
-  case EXP_CNT: {
-    // If there has been a mixture of export types, then a waitcnt exp(0) is
-    // required.
-    if (MixedExpTypes)
-      return true;
-    int NumEventTypes = 0;
-    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
-        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-
-    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
-        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
-      NumEventTypes++;
-    }
-
-    if (NumEventTypes <= 1) {
-      return false;
-    }
-    break;
-  }
-  default:
-    break;
-  }
-  return true;
+  // Scalar memory read always can go out of order.
+  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
+    return true;
+  return MixedPendingEvents[T];
 }
 
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
@@ -1023,14 +960,12 @@ void SIInsertWaitcnts::generateWaitcntIn
     if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
       // Export and GDS are tracked individually, either may trigger a waitcnt
       // for EXEC.
-      ScoreBrackets->determineWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK), Wait);
-      ScoreBrackets->determineWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS), Wait);
-      ScoreBrackets->determineWait(
-          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS), Wait);
-      ScoreBrackets->determineWait(
-          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK), Wait);
+      if (ScoreBrackets->hasPendingEvent(EXP_GPR_LOCK) ||
+          ScoreBrackets->hasPendingEvent(EXP_PARAM_ACCESS) ||
+          ScoreBrackets->hasPendingEvent(EXP_POS_ACCESS) ||
+          ScoreBrackets->hasPendingEvent(GDS_GPR_LOCK)) {
+        Wait.ExpCnt = 0;
+      }
     }
 
 #if 0 // TODO: the following code to handle CALL.
@@ -1135,7 +1070,7 @@ void SIInsertWaitcnts::generateWaitcntIn
   if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
     if (ScoreBrackets->getScoreLB(LGKM_CNT) <
             ScoreBrackets->getScoreUB(LGKM_CNT) &&
-        ScoreBrackets->hasPendingSMEM()) {
+        ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
       Wait.LgkmCnt = 0;
     }
   }
@@ -1315,7 +1250,6 @@ void SIInsertWaitcnts::mergeInputScoreBr
   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
   int32_t MaxPending[NUM_INST_CNTS] = {0};
   int32_t MaxFlat[NUM_INST_CNTS] = {0};
-  bool MixedExpTypes = false;
 
   // For single basic block loops, we need to retain the Block's
   // score bracket to have accurate Pred info. So, make a copy of Block's
@@ -1351,25 +1285,6 @@ void SIInsertWaitcnts::mergeInputScoreBr
           PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
       MaxFlat[T] = std::max(MaxFlat[T], span);
     }
-
-    MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
-  }
-
-  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-    bool Visited = BlockVisitedSet.count(Pred);
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
-      continue;
-    }
-
-    int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
-                  PredScoreBrackets->getScoreLB(EXP_CNT);
-    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
-    int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
-                  PredScoreBrackets->getScoreLB(EXP_CNT);
-    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
   }
 
 #if 0
@@ -1390,8 +1305,6 @@ void SIInsertWaitcnts::mergeInputScoreBr
     ScoreBrackets->setLastFlat(T, MaxFlat[T]);
   }
 
-  ScoreBrackets->setMixedExpTypes(MixedExpTypes);
-
   // Set the register scoreboard.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     if (!BlockVisitedSet.count(Pred)) {
@@ -1434,52 +1347,7 @@ void SIInsertWaitcnts::mergeInputScoreBr
       }
     }
 
-    // Also merge the WaitEvent information.
-    for (auto W : wait_event_types()) {
-      enum InstCounterType T = PredScoreBrackets->eventCounter(W);
-      int PredEventUB = PredScoreBrackets->getEventUB(W);
-      if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
-        int NewEventUB =
-            MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
-        if (NewEventUB > 0) {
-          ScoreBrackets->setEventUB(
-              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
-        }
-      }
-    }
-  }
-
-  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
-  // sequencing predecessors, because changes to EXEC require waitcnts due to
-  // the delayed nature of these operations.
-  for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (!BlockVisitedSet.count(Pred)) {
-      continue;
-    }
-
-    BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[Pred].get();
-
-    int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
-    if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
-      int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
-                       PredScoreBrackets->getScoreUB(EXP_CNT);
-      if (new_gds_ub > 0) {
-        ScoreBrackets->setEventUB(
-            GDS_GPR_LOCK,
-            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
-      }
-    }
-    int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
-    if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
-      int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
-                       PredScoreBrackets->getScoreUB(EXP_CNT);
-      if (new_exp_ub > 0) {
-        ScoreBrackets->setEventUB(
-            EXP_GPR_LOCK,
-            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
-      }
-    }
+    ScoreBrackets->mergePendingEvents(*PredScoreBrackets);
   }
 
   // if a single block loop, update the score brackets. Not needed for other
@@ -1562,7 +1430,7 @@ void SIInsertWaitcnts::insertWaitcntInBl
         (!VCCZBugHandledSet.count(&Inst))) {
       if (ScoreBrackets->getScoreLB(LGKM_CNT) <
               ScoreBrackets->getScoreUB(LGKM_CNT) &&
-          ScoreBrackets->hasPendingSMEM()) {
+          ScoreBrackets->hasPendingEvent(SMEM_ACCESS)) {
         if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
           VCCZBugWorkAround = true;
       }