[llvm] [RISCV] Change heuristic used for load clustering (PR #75341)

Alex Bradbury via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 13 11:35:17 PST 2023


https://github.com/asb updated https://github.com/llvm/llvm-project/pull/75341

>From 037f49eab2bb4d92629cfb42586d51c9ca1dd473 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 13 Dec 2023 13:32:13 +0000
Subject: [PATCH 1/3] [MachineScheduler] Add option to control reordering for
 store/load clustering

Reordering based on the sort order of the MemOpInfo array was disabled
in <https://reviews.llvm.org/D72706>. However, it's not clear this is
desirable for al targets. It also makes it more difficult to compare the
incremental benefit of enabling load clustering in the selectiondag
scheduler as well was the machinescheduler, as the sdag scheduler does
seem to allow this reordering.

This patch adds a parameter that can control the behaviour on a
per-target basis.
---
 llvm/include/llvm/CodeGen/MachineScheduler.h  | 10 ++++--
 llvm/lib/CodeGen/MachineScheduler.cpp         | 31 ++++++++++++-------
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp  |  2 +-
 .../CodeGen/RISCV/misched-load-clustering.ll  |  6 ++--
 4 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 9f16cf5d5bc387..47127a6c29603b 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1347,13 +1347,19 @@ ScheduleDAGMILive *createGenericSchedLive(MachineSchedContext *C);
 /// Create a generic scheduler with no vreg liveness or DAG mutation passes.
 ScheduleDAGMI *createGenericSchedPostRA(MachineSchedContext *C);
 
+/// If ReorderWhileClustering is set to true, no attempt will be made to
+/// reduce reordering due to store clustering.
 std::unique_ptr<ScheduleDAGMutation>
 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI);
+                             const TargetRegisterInfo *TRI,
+                             bool ReorderWhileClustering = false);
 
+/// If ReorderWhileClustering is set to true, no attempt will be made to
+/// reduce reordering due to store clustering.
 std::unique_ptr<ScheduleDAGMutation>
 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
-                              const TargetRegisterInfo *TRI);
+                              const TargetRegisterInfo *TRI,
+                              bool ReorderWhileClustering = false);
 
 std::unique_ptr<ScheduleDAGMutation>
 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 886137d86f87de..554776783eff6f 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1743,11 +1743,14 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
   bool IsLoad;
+  bool ReorderWhileClustering;
 
 public:
   BaseMemOpClusterMutation(const TargetInstrInfo *tii,
-                           const TargetRegisterInfo *tri, bool IsLoad)
-      : TII(tii), TRI(tri), IsLoad(IsLoad) {}
+                           const TargetRegisterInfo *tri, bool IsLoad,
+                           bool ReorderWhileClustering)
+      : TII(tii), TRI(tri), IsLoad(IsLoad),
+        ReorderWhileClustering(ReorderWhileClustering) {}
 
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
@@ -1763,14 +1766,16 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
 class StoreClusterMutation : public BaseMemOpClusterMutation {
 public:
   StoreClusterMutation(const TargetInstrInfo *tii,
-                       const TargetRegisterInfo *tri)
-      : BaseMemOpClusterMutation(tii, tri, false) {}
+                       const TargetRegisterInfo *tri,
+                       bool ReorderWhileClustering)
+      : BaseMemOpClusterMutation(tii, tri, false, ReorderWhileClustering) {}
 };
 
 class LoadClusterMutation : public BaseMemOpClusterMutation {
 public:
-  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
-      : BaseMemOpClusterMutation(tii, tri, true) {}
+  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri,
+                      bool ReorderWhileClustering)
+      : BaseMemOpClusterMutation(tii, tri, true, ReorderWhileClustering) {}
 };
 
 } // end anonymous namespace
@@ -1779,15 +1784,19 @@ namespace llvm {
 
 std::unique_ptr<ScheduleDAGMutation>
 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI) {
-  return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
+                             const TargetRegisterInfo *TRI,
+                             bool ReorderWhileClustering) {
+  return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(
+                                  TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
 std::unique_ptr<ScheduleDAGMutation>
 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
-                              const TargetRegisterInfo *TRI) {
-  return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
+                              const TargetRegisterInfo *TRI,
+                              bool ReorderWhileClustering) {
+  return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(
+                                  TII, TRI, ReorderWhileClustering)
                             : nullptr;
 }
 
@@ -1840,7 +1849,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
 
     SUnit *SUa = MemOpa.SU;
     SUnit *SUb = MemOpb.SU;
-    if (SUa->NodeNum > SUb->NodeNum)
+    if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
       std::swap(SUa, SUb);
 
     // FIXME: Is this check really required?
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 3abdb6003659fa..ae10a72e889239 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -348,7 +348,7 @@ class RISCVPassConfig : public TargetPassConfig {
     ScheduleDAGMILive *DAG = nullptr;
     if (EnableMISchedLoadClustering) {
       DAG = createGenericSchedLive(C);
-      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI, true));
     }
     if (ST.hasMacroFusion()) {
       DAG = DAG ? DAG : createGenericSchedLive(C);
diff --git a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
index 4eb969a357a9ee..d046aaf98f5edd 100644
--- a/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
+++ b/llvm/test/CodeGen/RISCV/misched-load-clustering.ll
@@ -23,10 +23,10 @@ define i32 @load_clustering_1(ptr nocapture %p) {
 ; LDCLUSTER: ********** MI Scheduling **********
 ; LDCLUSTER-LABEL: load_clustering_1:%bb.0
 ; LDCLUSTER: *** Final schedule for %bb.0 ***
-; LDCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
-; LDCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
-; LDCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
 ; LDCLUSTER: SU(4): %4:gpr = LW %0:gpr, 4
+; LDCLUSTER: SU(2): %2:gpr = LW %0:gpr, 8
+; LDCLUSTER: SU(1): %1:gpr = LW %0:gpr, 12
+; LDCLUSTER: SU(5): %6:gpr = LW %0:gpr, 16
 entry:
   %arrayidx0 = getelementptr inbounds i32, ptr %p, i32 3
   %val0 = load i32, i32* %arrayidx0

>From ef43091841e891768642ee0cf7db6f6ccef27cc1 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 13 Dec 2023 13:50:55 +0000
Subject: [PATCH 2/3] [RISCV] Change heuristic used for load clustering

Split out from #73789. Clusters if the operations are within a cache
line of each other (as AMDGPU does in shouldScheduleLoadsNear). X86 does
something similar, but does `((Offset2 - Offset1) / 8 > 64)`. I'm not
sure if that's intentionally set to 512 bytes or if the division is in
error.

Adopts the suggestion from @wangpc-pp to query the cache line size and
use it if available.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 1dcff7eb563e20..b8960fd9571c9b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2282,9 +2282,13 @@ bool RISCVInstrInfo::shouldClusterMemOps(
     return false;
   }
 
-  // TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
-  // indicate they likely share a cache line.
-  return ClusterSize <= 4;
+  unsigned CacheLineSize =
+      BaseOps1.front()->getParent()->getMF()->getSubtarget().getCacheLineSize();
+  // Assume a cache line size of 64 bytes if no size is set in RISCVSubtarget.
+  CacheLineSize = CacheLineSize ? CacheLineSize : 64;
+  // Cluster if the memory operations are on the same or a neighbouring cache
+  // line.
+  return std::abs(Offset1 - Offset2) < CacheLineSize;
 }
 
 // Set BaseReg (the base register operand), Offset (the byte offset being

>From 2160738ff21e43a6b2047219686e5afef0f5e1b5 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 13 Dec 2023 19:34:46 +0000
Subject: [PATCH 3/3] Tweak heuristic to limit the maximum cluster size

---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index b8960fd9571c9b..cd98438eed8821 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2287,8 +2287,9 @@ bool RISCVInstrInfo::shouldClusterMemOps(
   // Assume a cache line size of 64 bytes if no size is set in RISCVSubtarget.
   CacheLineSize = CacheLineSize ? CacheLineSize : 64;
   // Cluster if the memory operations are on the same or a neighbouring cache
-  // line.
-  return std::abs(Offset1 - Offset2) < CacheLineSize;
+  // line, but limit the maximum ClusterSize to avoid creating too much
+  // additional register pressure.
+  return ClusterSize <= 4 && std::abs(Offset1 - Offset2) < CacheLineSize;
 }
 
 // Set BaseReg (the base register operand), Offset (the byte offset being



More information about the llvm-commits mailing list