[llvm] be8e38c - Correct NumLoads in clustering

Fri Jan 24 12:45:47 PST 2020

Author: Stanislav Mekhanoshin
Date: 2020-01-24T12:45:28-08:00
New Revision: be8e38cbd9785d4f4023b88150d14bd815265eef

URL: https://github.com/llvm/llvm-project/commit/be8e38cbd9785d4f4023b88150d14bd815265eef
DIFF: https://github.com/llvm/llvm-project/commit/be8e38cbd9785d4f4023b88150d14bd815265eef.diff

LOG: Correct NumLoads in clustering

Scheduler sends NumLoads argument into shouldClusterMemOps()
one less the actual cluster length. So for 2 instructions
it will pass just 1. Correct this number.

This is NFC for in tree targets.

Differential Revision: https://reviews.llvm.org/D73292

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetInstrInfo.h
    llvm/lib/CodeGen/MachineScheduler.cpp
    llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 7f654fec7cfb..6b24e1221e7f 100644

--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1276,6 +1276,10 @@ class TargetInstrInfo : public MCInstrInfo {
   /// or
   ///   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   /// to TargetPassConfig::createMachineScheduler() to have an effect.
+  ///
+  /// \p BaseOps1 and \p BaseOps2 are memory operands of two memory operations.
+  /// \p NumLoads is the number of loads that will be in the cluster if this
+  /// hook returns true.
   virtual bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                    ArrayRef<const MachineOperand *> BaseOps2,
                                    unsigned NumLoads) const {

diff  --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 7de1a5f596e9..2bd48ab2a2b4 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1584,7 +1584,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
     SUnit *SUb = MemOpRecords[Idx+1].SU;
     if (TII->shouldClusterMemOps(MemOpRecords[Idx].BaseOps,
                                  MemOpRecords[Idx + 1].BaseOps,
-                                 ClusterLength)) {
+                                 ClusterLength + 1)) {
       if (SUa->NodeNum > SUb->NodeNum)
         std::swap(SUa, SUb);
       if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index f77c5351f3b7..e11f7f18d978 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2422,7 +2422,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(
     return false;
 
   // Only cluster up to a single pair.
-  if (NumLoads > 1)
+  if (NumLoads > 2)
     return false;
 
   if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2ac5aac6d5ce..f18f06754a55 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -457,7 +457,7 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
       (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
       (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
-    const unsigned MaxGlobalLoadCluster = 6;
+    const unsigned MaxGlobalLoadCluster = 7;
     if (NumLoads > MaxGlobalLoadCluster)
       return false;
 
@@ -497,7 +497,11 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                          ? MRI.getRegClass(Reg)
                                          : RI.getPhysRegClass(Reg);
 
-  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
+  // FIXME: NumLoads should not be subtracted 1. This is to match behavior
+  // of clusterNeighboringMemOps which was previosly passing cluster length
+  // less 1. LoadClusterThreshold should be tuned instead.
+  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
+         LoadClusterThreshold;
 }
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,