[llvm] 1d178d6 - [Scheduling] Fall back to the fast cluster algorithm if the DAG is too complex

QingShan Zhang via llvm-commits llvm-commits at lists.llvm.org
Sun Nov 1 18:12:06 PST 2020


Author: QingShan Zhang
Date: 2020-11-02T02:11:52Z
New Revision: 1d178d600af77599b398930a640991c9c965a47c

URL: https://github.com/llvm/llvm-project/commit/1d178d600af77599b398930a640991c9c965a47c
DIFF: https://github.com/llvm/llvm-project/commit/1d178d600af77599b398930a640991c9c965a47c.diff

LOG: [Scheduling] Fall back to the fast cluster algorithm if the DAG is too complex

We added a new load/store cluster algorithm in D85517. However, AArch64 sees
some compile-time degradation with the new algorithm because IsReachable() is
not cheap, O(M+N), when the DAG is complex. See https://bugs.llvm.org/show_bug.cgi?id=47966
So, this patch adds a heuristic to switch to the old cluster algorithm if the DAG is too complex.
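
As a rough illustration (a sketch based on the groupMemOps() change in the
diff below, not a verbatim excerpt), the fallback decision compares the
product of the number of clusterable mem ops and the number of DAG nodes
against a threshold:

  // Sketch of the heuristic; names match the diff below. ForceFastCluster
  // defaults to false and FastClusterThreshold to 1000 (both hidden cl::opts).
  bool FastCluster =
      ForceFastCluster ||
      MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
  // With the defaults, the fast (old) algorithm is used once the product of
  // the mem op count and the SUnit count exceeds roughly one million.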

Reviewed By: Owen Anderson

Differential Revision: https://reviews.llvm.org/D90144

Added: 
    

Modified: 
    llvm/lib/CodeGen/MachineScheduler.cpp
    llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index b23913197f02..256628a179ae 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -129,6 +129,15 @@ static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
                                         cl::desc("Enable memop clustering."),
                                         cl::init(true));
+static cl::opt<bool>
+    ForceFastCluster("force-fast-cluster", cl::Hidden,
+                     cl::desc("Switch to fast cluster algorithm with the lost "
+                              "of some fusion opportunities"),
+                     cl::init(false));
+static cl::opt<unsigned>
+    FastClusterThreshold("fast-cluster-threshold", cl::Hidden,
+                         cl::desc("The threshold for fast cluster"),
+                         cl::init(1000));
 
 // DAG subtrees must have at least this many nodes.
 static const unsigned MinSubtreeSize = 8;
@@ -1530,10 +1539,12 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
 protected:
-  void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps,
+  void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster,
                                 ScheduleDAGInstrs *DAG);
   void collectMemOpRecords(std::vector<SUnit> &SUnits,
                            SmallVectorImpl<MemOpInfo> &MemOpRecords);
+  bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
+                   DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups);
 };
 
 class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1572,8 +1583,11 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII,
 // Sorting all the loads/stores first, then for each load/store, checking the
 // following load/store one by one, until reach the first non-dependent one and
 // call target hook to see if they can cluster.
+// If FastCluster is enabled, we assume that all the loads/stores have already
+// been preprocessed, so they do not have dependencies on each other.
 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
-    ArrayRef<MemOpInfo> MemOpRecords, ScheduleDAGInstrs *DAG) {
+    ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
+    ScheduleDAGInstrs *DAG) {
   // Keep track of the current cluster length and bytes for each SUnit.
   DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
 
@@ -1589,8 +1603,9 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
       // Skip if MemOpb has been clustered already or has dependency with
       // MemOpa.
       if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
-          !DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
-          !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))
+          (FastCluster ||
+           (!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
+            !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))))
         break;
     if (NextIdx == End)
       continue;
@@ -1685,6 +1700,36 @@ void BaseMemOpClusterMutation::collectMemOpRecords(
   }
 }
 
+bool BaseMemOpClusterMutation::groupMemOps(
+    ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
+    DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) {
+  bool FastCluster =
+      ForceFastCluster ||
+      MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
+
+  for (const auto &MemOp : MemOps) {
+    unsigned ChainPredID = DAG->SUnits.size();
+    if (FastCluster) {
+      for (const SDep &Pred : MemOp.SU->Preds) {
+        // We only want to cluster the mem ops that have the same ctrl (non-data)
+        // pred so that they don't have a ctrl dependency on each other. But for
+        // store instrs, we can still cluster them if the pred is a load instr.
+        if ((Pred.isCtrl() &&
+             (IsLoad ||
+              (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
+            !Pred.isArtificial()) {
+          ChainPredID = Pred.getSUnit()->NodeNum;
+          break;
+        }
+      }
+    } else
+      ChainPredID = 0;
+
+    Groups[ChainPredID].push_back(MemOp);
+  }
+  return FastCluster;
+}
+
 /// Callback from DAG postProcessing to create cluster edges for loads/stores.
 void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
   // Collect all the clusterable loads/stores
@@ -1694,12 +1739,20 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
   if (MemOpRecords.size() < 2)
     return;
 
-  // Sorting the loads/stores, so that, we can stop the cluster as early as
-  // possible.
-  llvm::sort(MemOpRecords);
+  // If the DAG is too complex, use a heuristic to put the loads/stores that
+  // have no dependency on each other into the same group, to avoid a compile
+  // time blow up. Note that some fusion pairs could be lost with this.
+  DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups;
+  bool FastCluster = groupMemOps(MemOpRecords, DAG, Groups);
 
-  // Trying to cluster all the neighboring loads/stores.
-  clusterNeighboringMemOps(MemOpRecords, DAG);
+  for (auto &Group : Groups) {
+    // Sort the loads/stores so that we can stop clustering as early as
+    // possible.
+    llvm::sort(Group.second);
+
+    // Trying to cluster all the neighboring loads/stores.
+    clusterNeighboringMemOps(Group.second, FastCluster, DAG);
+  }
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
index e95321582def..1c093989c395 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -force-fast-cluster -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK-FAST
 
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: stp_i64_scale:%bb.0
@@ -227,6 +228,10 @@ entry:
 ; CHECK:SU(7):   %5:gpr32 = LDRWui %1:gpr64common, 1 ::
 ; CHECK:Predecessors:
 ; CHECK:SU(6): Ord  Latency=1 Memory
+; CHECK-FAST: cluster_with_different_preds:%bb.0
+; CHECK-FAST-NOT: Cluster ld/st
+; CHECK-FAST:SU(3):   STRWui %2:gpr32, %0:gpr64common, 0 ::
+; CHECK-FAST:SU(4):   %3:gpr32 = LDRWui %1:gpr64common, 0 ::
 define i32 @cluster_with_different_preds(i32* %p, i32* %q) {
 entry:
   store i32 3, i32* %p, align 4


        

