[llvm] e3ffe72 - [AMDGPU] Cluster shader exports

Thu May 7 03:06:51 PDT 2020

Author: Carl Ritson
Date: 2020-05-07T19:05:38+09:00
New Revision: e3ffe7269b6992a23a76b3148cb930c5b62ded88

URL: https://github.com/llvm/llvm-project/commit/e3ffe7269b6992a23a76b3148cb930c5b62ded88
DIFF: https://github.com/llvm/llvm-project/commit/e3ffe7269b6992a23a76b3148cb930c5b62ded88.diff

LOG: [AMDGPU] Cluster shader exports

Summary:
Add DAG scheduling mutation to cluster export instructions.
This avoids unnecessary waitcnts being added when computation
ends up interspersed with exports.

Reviewers: foad, arsenm, rampitec, nhaehnle

Reviewed By: foad

Subscribers: kzhuravl, jvesely, wdng, mgorny, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D79481

Added: 
    llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
new file mode 100644
index 000000000000..42ff12ddda2b

--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
@@ -0,0 +1,92 @@
+//===--- AMDGPUExportClustering.cpp - AMDGPU Export Clustering ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to cluster shader
+///       exports.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUExportClustering.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class ExportClustering : public ScheduleDAGMutation {
+public:
+  ExportClustering() {}
+  void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+static bool isExport(const SUnit &SU) {
+  const MachineInstr *MI = SU.getInstr();
+  return MI->getOpcode() == AMDGPU::EXP ||
+         MI->getOpcode() == AMDGPU::EXP_DONE;
+}
+
+static void buildCluster(ArrayRef<SUnit *> Exports, ScheduleDAGInstrs *DAG) {
+  // Cluster a series of exports. Also copy all dependencies to the first
+  // export to avoid computation being inserted into the chain.
+  SUnit *ChainHead = Exports[0];
+  for (unsigned Idx = 0, End = Exports.size() - 1; Idx < End; ++Idx) {
+    SUnit *SUa = Exports[Idx];
+    SUnit *SUb = Exports[Idx + 1];
+    if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
+      for (const SDep &Pred : SUb->Preds) {
+        SUnit *PredSU = Pred.getSUnit();
+        if (Pred.isWeak() || isExport(*PredSU))
+          continue;
+        DAG->addEdge(ChainHead, SDep(PredSU, SDep::Artificial));
+      }
+    }
+  }
+}
+
+void ExportClustering::apply(ScheduleDAGInstrs *DAG) {
+  SmallVector<SmallVector<SUnit *, 8>, 4> ExportChains;
+  DenseMap<unsigned, unsigned> ChainMap;
+
+  // Build chains of exports
+  for (SUnit &SU : DAG->SUnits) {
+    if (!isExport(SU))
+      continue;
+
+    unsigned ChainID = ExportChains.size();
+    for (const SDep &Pred : SU.Preds) {
+      const SUnit &PredSU = *Pred.getSUnit();
+      if (isExport(PredSU) && !Pred.isArtificial()) {
+        ChainID = ChainMap.lookup(PredSU.NodeNum);
+        break;
+      }
+    }
+    ChainMap[SU.NodeNum] = ChainID;
+
+    if (ChainID == ExportChains.size())
+      ExportChains.push_back(SmallVector<SUnit *, 8>());
+
+    auto &Chain = ExportChains[ChainID];
+    Chain.push_back(&SU);
+  }
+
+  // Apply clustering
+  for (auto &Chain : ExportChains)
+    buildCluster(Chain, DAG);
+}
+
+} // end namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation() {
+  return std::make_unique<ExportClustering>();
+}
+
+} // end namespace llvm

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
new file mode 100644
index 000000000000..58491d0671e4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
@@ -0,0 +1,15 @@
+//===- AMDGPUExportClustering.h - AMDGPU Export Clustering ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation();
+
+} // namespace llvm

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3db52afff861..b5a2abcd7a82 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -16,6 +16,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUAliasAnalysis.h"
 #include "AMDGPUCallLowering.h"
+#include "AMDGPUExportClustering.h"
 #include "AMDGPUInstructionSelector.h"
 #include "AMDGPULegalizerInfo.h"
 #include "AMDGPUMacroFusion.h"
@@ -283,6 +284,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index d9c900bb7446..c273ea89bd91 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -42,6 +42,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUHSAMetadataStreamer.cpp

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index 1bbd209c21af..d6d80246a89b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -542,14 +542,13 @@ end:
 
 ; GCN-LABEL: {{^}}test_export_clustering:
 ; GCN-DAG: v_mov_b32_e32 [[W0:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[W1:v[0-9]+]], 1.0
 ; GCN-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0
 ; GCN-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1
 ; GCN-DAG: v_add_f32_e32 [[Z0:v[0-9]+]]
-; GCN-DAG: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
 ; GCN-DAG: v_sub_f32_e32 [[Z1:v[0-9]+]]
-; GCN: s_waitcnt expcnt(0)
-; GCN: v_mov_b32_e32 [[W1:v[0-9]+]], 1.0
-; GCN: exp param1 [[X]], [[Y]], [[Z1]], [[W1]] done{{$}}
+; GCN: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
+; GCN-NEXT: exp param1 [[X]], [[Y]], [[Z1]], [[W1]] done{{$}}
 define amdgpu_kernel void @test_export_clustering(float %x, float %y) #0 {
   %z0 = fadd float %x, %y
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %x, float %y, float %z0, float 0.0, i1 false, i1 false)