[llvm] [AMDGPU]: Fall back to default mutations when iglp is not applied (PR #93418)

via llvm-commits llvm-commits at lists.llvm.org
Sun May 26 10:03:24 PDT 2024


llvmbot wrote:



@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)


Some regions may contain IGLP instructions but present no IGLP mutation opportunities. In these regions we should fall back to the default mutations, which may still improve performance. This is especially useful for the more selective strategies such as iglp_opt(2).
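
For context, here is a minimal sketch of where such a region comes from. The `__builtin_amdgcn_iglp_opt` clang builtin lowers to the `llvm.amdgcn.iglp.opt` intrinsic and ultimately to an IGLP_OPT machine instruction in the surrounding scheduling region; the kernel name and body below are purely illustrative, not taken from this PR.

```cpp
// Hypothetical HIP kernel. The builtin drops an IGLP_OPT instruction into
// this scheduling region. With the selective variant 2, the region can
// carry IGLP_OPT yet offer no IGLP opportunity -- exactly the case where
// this patch now falls back to the default mutations (e.g. load clustering).
__global__ void example_kernel(float *Out, const float *In) {
  __builtin_amdgcn_iglp_opt(2); // request the selective iglp_opt(2) strategy
  int Idx = threadIdx.x;
  Out[Idx] = In[Idx] + 1.0f;
}
```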

This is a second attempt at this feature; the first attempt was part of https://github.com/llvm/llvm-project/commit/8f2bd8ae68883592a333f4bdbed9798d66e68630. The issue with that attempt was that the post-RA scheduler did not restore the saved mutations per region (::schedule is called per region, while ::finalizeSchedule is called per block). So, when a block contained multiple regions with IGLP instructions, every region after the first would fall back to an IGroupLPDAGMutation whose SavedMutations contained itself.
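
A minimal sketch of that failure mode, using hypothetical stand-in types (the real ones are ScheduleDAGMutation and GCNPostScheduleDAGMILive); this illustrates the swap/restore bookkeeping only, not the actual scheduler code:

```cpp
#include <memory>
#include <vector>

struct Mutation {
  // Fallback list this mutation applies when it finds no IGLP opportunity.
  std::vector<std::unique_ptr<Mutation>> *Fallback = nullptr;
  virtual ~Mutation() = default;
};

struct PostRADAG {
  std::vector<std::unique_ptr<Mutation>> Mutations;      // active mutations
  std::vector<std::unique_ptr<Mutation>> SavedMutations; // stashed defaults

  // ::schedule runs once per region.
  void scheduleRegion() {
    SavedMutations.clear();
    SavedMutations.swap(Mutations); // stash the default mutations
    auto IGLP = std::make_unique<Mutation>();
    IGLP->Fallback = &SavedMutations; // fall back to the stashed defaults
    Mutations.push_back(std::move(IGLP));
    // ... ScheduleDAGMI::schedule() applies Mutations here ...
    SavedMutations.swap(Mutations); // the fix: restore per region
    // In the first attempt this restore lived in ::finalizeSchedule, so a
    // second IGLP region in the same block would swap the previous region's
    // IGLP mutation into SavedMutations: the new IGLP mutation's fallback
    // list then held a mutation whose Fallback pointed back at that list.
  }
};
```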

---
Full diff: https://github.com/llvm/llvm-project/pull/93418.diff


5 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (+17-4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h (+3-2) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+5-3) 
- (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+8-6) 
- (modified) llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir (+20) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 57769fe998d1f..e3f7248507953 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2337,6 +2337,8 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
 
   ScheduleDAGMI *DAG;
 
+  std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations;
+
   // Organize lists of SchedGroups by their SyncID. SchedGroups /
   // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
  // between them.
@@ -2379,7 +2381,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
 
   IGroupLPDAGMutation() = default;
-  IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
+  IGroupLPDAGMutation(
+      AMDGPU::SchedulingPhase Phase,
+      std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations)
+      : SavedMutations(SavedMutations), Phase(Phase) {}
 };
 
 unsigned SchedGroup::NumSchedGroups = 0;
@@ -2597,6 +2602,13 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
     PS.solve();
     return;
   }
+
+  if (!SavedMutations)
+    return;
+
+  // We did not apply a mutation; fall back to SavedMutations
+  for (auto &m : *SavedMutations)
+    m->apply(DAG);
 }
 
 void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
@@ -2695,9 +2707,10 @@ namespace llvm {
 /// same scheduling region (e.g. pre and post-RA scheduling / multiple
 /// scheduling "phases"), we can reenter this mutation framework more than once
 /// for a given region.
-std::unique_ptr<ScheduleDAGMutation>
-createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
-  return std::make_unique<IGroupLPDAGMutation>(Phase);
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
+    AMDGPU::SchedulingPhase Phase,
+    std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations) {
+  return std::make_unique<IGroupLPDAGMutation>(Phase, SavedMutations);
 }
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index aff7096f26d67..46ef4d702d002 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -20,8 +20,9 @@ namespace AMDGPU {
 enum class SchedulingPhase { Initial, PreRAReentry, PostRA };
 } // namespace AMDGPU
 
-std::unique_ptr<ScheduleDAGMutation>
-createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase);
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
+    AMDGPU::SchedulingPhase Phase,
+    std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations);
 
 } // namespace llvm
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 20329dea60275..b9a0cfc5b130d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -471,7 +471,8 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
+  DAG->addMutation(
+      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
@@ -481,7 +482,8 @@ static ScheduleDAGInstrs *
 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
-  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
+  DAG->addMutation(
+      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
   return DAG;
 }
 
@@ -893,7 +895,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
     DAG->addMutation(
-        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
+        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, nullptr));
     if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
       DAG->addMutation(createVOPDPairingMutation());
     return DAG;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 94d93390d0916..3882ab4cf58bf 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -713,8 +713,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
     return false;
 
   SavedMutations.swap(DAG.Mutations);
-  DAG.addMutation(
-      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
+  DAG.addMutation(createIGroupLPDAGMutation(
+      AMDGPU::SchedulingPhase::PreRAReentry, nullptr));
 
   InitialOccupancy = DAG.MinOccupancy;
   // Aggressively try to reduce register pressure in the unclustered high RP
@@ -858,7 +858,8 @@ bool GCNSchedStage::initGCNRegion() {
                           StageID == GCNSchedStageID::ILPInitialSchedule;
     DAG.addMutation(createIGroupLPDAGMutation(
         IsInitialStage ? AMDGPU::SchedulingPhase::Initial
-                       : AMDGPU::SchedulingPhase::PreRAReentry));
+                       : AMDGPU::SchedulingPhase::PreRAReentry,
+        &SavedMutations));
   }
 
   return true;
@@ -1577,15 +1578,16 @@ void GCNPostScheduleDAGMILive::schedule() {
   if (HasIGLPInstrs) {
     SavedMutations.clear();
     SavedMutations.swap(Mutations);
-    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
+    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA,
+                                          &SavedMutations));
   }
 
   ScheduleDAGMI::schedule();
-}
 
-void GCNPostScheduleDAGMILive::finalizeSchedule() {
   if (HasIGLPInstrs)
     SavedMutations.swap(Mutations);
+}
 
+void GCNPostScheduleDAGMILive::finalizeSchedule() {
   ScheduleDAGMI::finalizeSchedule();
 }
diff --git a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
index 0d84dc0bdc53e..287e58dfc536e 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
+++ b/llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir
@@ -18,3 +18,23 @@ body:             |
     %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
     %3 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32))
 ...
+---
+# GCN-LABEL: name: cluster_flat_loads_iglp_opt
+# GCN:      FLAT_LOAD_DWORD %0, 0
+# GCN-NEXT: FLAT_LOAD_DWORD %0, 4
+# GCN-NEXT: V_ADD_F32_e64
+name: cluster_flat_loads_iglp_opt
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec
+    %3 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    IGLP_OPT 2
+...

``````````



https://github.com/llvm/llvm-project/pull/93418

