[llvm] 6ddf2a8 - [AMDGPU] Adjust wave priority based on VMEM instructions to avoid duty-cycling.

Wed Apr 27 06:40:07 PDT 2022

Author: Ivan Kosarev
Date: 2022-04-27T14:37:18+01:00
New Revision: 6ddf2a824da97c81d7963c082c62640e8173b5b0

URL: https://github.com/llvm/llvm-project/commit/6ddf2a824da97c81d7963c082c62640e8173b5b0
DIFF: https://github.com/llvm/llvm-project/commit/6ddf2a824da97c81d7963c082c62640e8173b5b0.diff

LOG: [AMDGPU] Adjust wave priority based on VMEM instructions to avoid duty-cycling.

As older waves execute long sequences of VALU instructions, this may
prevent younger waves from address calculation and then issuing their
VMEM loads, which in turn leads the VALU unit to idle. This patch tries
to prevent this by temporarily raising the wave's priority.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D124246

Added: 
    llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
    llvm/test/CodeGen/AMDGPU/set-wave-priority.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 892638029ac5e..33f59ad60b3eb 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -331,6 +331,9 @@ extern char &GCNNSAReassignID;
 void initializeGCNPreRAOptimizationsPass(PassRegistry &);
 extern char &GCNPreRAOptimizationsID;
 
+FunctionPass *createAMDGPUSetWavePriorityPass();
+void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
new file mode 100644
index 0000000000000..34702ee6623bb
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -0,0 +1,166 @@
+//===- AMDGPUSetWavePriority.cpp - Set wave priority ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to temporarily raise the wave priority from the start of the
+/// shader function until its last VMEM loads, to allow younger
+/// waves to issue their VMEM instructions as well.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Allocator.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-set-wave-priority"
+
+namespace {
+
+struct MBBInfo { // Per-block state; stored per block in MBBInfoSet.
+  MBBInfo() = default;
+  bool MayReachVMEMLoad = false; // True if control may reach a VMEM load.
+};
+
+using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>;
+
+class AMDGPUSetWavePriority : public MachineFunctionPass {
+public:
+  static char ID; // Defined (as 0) after the class.
+
+  AMDGPUSetWavePriority() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Set wave priority"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const;
+
+  const SIInstrInfo *TII; // Assigned in runOnMachineFunction().
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false,
+                false)
+
+char AMDGPUSetWavePriority::ID = 0;
+
+FunctionPass *llvm::createAMDGPUSetWavePriorityPass() {
+  return new AMDGPUSetWavePriority(); // Newly allocated pass instance.
+}
+
+MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF,
+                                                    unsigned priority) const {
+  return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority);
+} // Returns an S_SETPRIO with immediate `priority`; callers insert it.
+
+// Returns true if, for every predecessor Pred of MBB that may reach a VMEM
+// load, none of Pred's successors may reach one; otherwise returns false.
+static bool CanLowerPriorityDirectlyInPredecessors(const MachineBasicBlock &MBB,
+                                                   MBBInfoSet &MBBInfos) {
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    if (!MBBInfos[Pred].MayReachVMEMLoad)
+      continue; // Pred cannot reach a VMEM load; nothing to check.
+    for (const MachineBasicBlock *Succ : Pred->successors()) {
+      if (MBBInfos[Succ].MayReachVMEMLoad)
+        return false; // A load is still reachable past Pred.
+    }
+  }
+  return true;
+}
+
+static bool isVMEMLoad(const MachineInstr &MI) { // VMEM and may load.
+  return SIInstrInfo::isVMEM(MI) && MI.mayLoad();
+}
+
+bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
+  const unsigned HighPriority = 3; // Set at the start of the shader.
+  const unsigned LowPriority = 0;  // Set where the priority is lowered again.
+
+  Function &F = MF.getFunction();
+  if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+    return false; // Skipped, or not an entry function.
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+
+  MBBInfoSet MBBInfos;
+  SmallVector<const MachineBasicBlock *, 16> Worklist;
+  for (MachineBasicBlock &MBB : MF) { // Seed with blocks containing a load.
+    if (any_of(MBB, isVMEMLoad))
+      Worklist.push_back(&MBB);
+  }
+
+  // Walk predecessors to mark blocks from which control may reach VMEM loads.
+  while (!Worklist.empty()) {
+    const MachineBasicBlock *MBB = Worklist.pop_back_val();
+    MBBInfo &Info = MBBInfos[MBB];
+    if (!Info.MayReachVMEMLoad) { // Visit each block only once.
+      Info.MayReachVMEMLoad = true;
+      Worklist.append(MBB->pred_begin(), MBB->pred_end());
+    }
+  }
+
+  MachineBasicBlock &Entry = MF.front();
+  if (!MBBInfos[&Entry].MayReachVMEMLoad)
+    return false; // No VMEM load is reachable from the entry block.
+
+  // Raise the priority at the beginning of the shader.
+  MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end();
+  while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator())
+    ++I; // Stop at the first VALU instruction or terminator.
+  Entry.insert(I, BuildSetprioMI(MF, HighPriority));
+
+  // Lower the priority on edges where control leaves blocks from which
+  // VMEM loads are reachable.
+  SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBBInfos[&MBB].MayReachVMEMLoad) {
+      if (MBB.succ_empty()) // Exit block: lower within the block itself.
+        PriorityLoweringBlocks.insert(&MBB);
+      continue;
+    }
+
+    if (CanLowerPriorityDirectlyInPredecessors(MBB, MBBInfos)) {
+      for (MachineBasicBlock *Pred : MBB.predecessors()) {
+        if (MBBInfos[Pred].MayReachVMEMLoad)
+          PriorityLoweringBlocks.insert(Pred);
+      }
+      continue;
+    }
+
+    // Where lowering the priority in predecessors is not possible, the
+    // block receiving control either was not part of a loop in the first
+    // place or the loop simplification/canonicalization pass should have
+    // already tried to split the edge and insert a preheader, and if for
+    // whatever reason it failed to do so, then this leaves us with the
+    // only option of lowering the priority within the loop.
+    PriorityLoweringBlocks.insert(&MBB);
+  }
+
+  for (MachineBasicBlock *MBB : PriorityLoweringBlocks) {
+    MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin();
+    while (I != B) { // Scan backwards for the last VMEM load in the block.
+      if (isVMEMLoad(*--I)) {
+        ++I; // Insert right after that load.
+        break;
+      }
+    }
+    MBB->insert(I, BuildSetprioMI(MF, LowPriority)); // At begin() if no load.
+  }
+
+  return true;
+}

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8db5d63f290b1..afc8fd3a9f1f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -277,6 +277,10 @@ EnableDCEInRA("amdgpu-dce-in-ra",
     cl::init(true), cl::Hidden,
     cl::desc("Enable machine DCE inside regalloc"));
 
+static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
+                                           cl::desc("Adjust wave priority"),
+                                           cl::init(false), cl::Hidden);
+
 static cl::opt<bool> EnableScalarIRPasses(
   "amdgpu-scalar-ir-passes",
   cl::desc("Enable scalar IR passes"),
@@ -1360,6 +1364,8 @@ void GCNPassConfig::addPreEmitPass() {
     addPass(&SIInsertHardClausesID);
 
   addPass(&SILateBranchLoweringPassID);
+  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
+    addPass(createAMDGPUSetWavePriorityPass());
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIPreEmitPeepholeID);
   // The hazard recognizer that runs as part of the post-ra scheduler does not

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 586c70797dad0..39685c33b0397 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -89,6 +89,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUReplaceLDSUseWithPointer.cpp
   AMDGPUResourceUsageAnalysis.cpp
   AMDGPURewriteOutArguments.cpp
+  AMDGPUSetWavePriority.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp

diff  --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
new file mode 100644
index 0000000000000..ed7dc6239e35e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -0,0 +1,153 @@
+; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \
+; RUN:   FileCheck %s
+
+; CHECK-LABEL: no_setprio:
+; CHECK-NOT:       s_setprio
+; CHECK:           ; return to shader part epilog
+define amdgpu_ps <2 x float> @no_setprio() {
+  ret <2 x float> <float 0.0, float 0.0>
+}
+
+; CHECK-LABEL: vmem_in_exit_block:
+; CHECK:           s_setprio 3
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           ; return to shader part epilog
+define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; CHECK-LABEL: branch:
+; CHECK:           s_setprio 3
+; CHECK:           s_cbranch_scc0 [[A:.*]]
+; CHECK:       {{.*}}:  ; %b
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[EXIT:.*]]
+; CHECK:       [[A]]:  ; %a
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[EXIT]]
+; CHECK-NEXT:  [[EXIT]]:
+define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) {
+  %cond = icmp eq i32 %i, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  ret <2 x float> <float 0.0, float 0.0>
+
+b:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; CHECK-LABEL: setprio_follows_setprio:
+; CHECK:           s_setprio 3
+; CHECK:           buffer_load_dwordx2
+; CHECK:           s_cbranch_scc1 [[C:.*]]
+; CHECK:       {{.*}}:  ; %a
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_cbranch_scc1 [[C]]
+; CHECK:       {{.*}}:  ; %b
+; CHECK-NOT:       s_setprio
+; CHECK:           s_branch [[EXIT:.*]]
+; CHECK:       [[C]]:  ; %c
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[EXIT]]
+; CHECK:       [[EXIT]]:
+define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) {
+entry:
+  %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond1 = icmp ne i32 %i, 0
+  br i1 %cond1, label %a, label %c
+
+a:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %cond2 = icmp ne i32 %i, 1
+  br i1 %cond2, label %b, label %c
+
+b:
+  ret <2 x float> %v2
+
+c:
+  %v3 = phi <2 x float> [%v1, %entry], [%v2, %a]
+  %v4 = fadd <2 x float> %v1, %v3
+  ret <2 x float> %v4
+}
+
+; CHECK-LABEL: loop:
+; CHECK:       {{.*}}:  ; %entry
+; CHECK:           s_setprio 3
+; CHECK-NOT:       s_setprio
+; CHECK:       [[LOOP:.*]]:  ; %loop
+; CHECK-NOT:       s_setprio
+; CHECK:           buffer_load_dwordx2
+; CHECK-NOT:       s_setprio
+; CHECK:           s_cbranch_scc1 [[LOOP]]
+; CHECK-NEXT:  {{.*}}:  ; %exit
+; CHECK-NEXT:      s_setprio 0
+define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %sum = phi <2 x float> [<float 0.0, float 0.0>, %entry], [%sum2, %loop]
+
+  %i2 = add i32 %i, 1
+
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %i, i32 0, i32 0, i32 0)
+  %sum2 = fadd <2 x float> %sum, %v
+
+  %cond = icmp ult i32 %i2, 5
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret <2 x float> %sum2
+}
+
+; CHECK-LABEL: edge_split:
+; CHECK:           s_setprio 3
+; CHECK:           buffer_load_dwordx2
+; CHECK-NOT:       s_setprio
+; CHECK:           s_cbranch_scc1 [[ANOTHER_LOAD:.*]]
+; CHECK:       {{.*}}:  ; %loop.preheader
+; CHECK-NEXT:      s_setprio 0
+; CHECK:       [[LOOP:.*]]:  ; %loop
+; CHECK-NOT:       s_setprio
+; CHECK:           s_cbranch_scc1 [[LOOP]]
+; CHECK:       {{.*}}:  ; %exit
+; CHECK-NOT:       s_setprio
+; CHECK:           s_branch [[RET:.*]]
+; CHECK:       [[ANOTHER_LOAD]]:  ; %another_load
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[RET]]
+; CHECK:       [[RET]]:
+define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) {
+entry:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %loop, label %another_load
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %mul = phi <2 x float> [%v, %entry], [%mul2, %loop]
+
+  %i2 = add i32 %i, 1
+  %mul2 = fmul <2 x float> %mul, %v
+
+  %cond2 = icmp ult i32 %i2, 5
+  br i1 %cond2, label %loop, label %exit
+
+exit:
+  ret <2 x float> %mul2
+
+another_load:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %sum = fadd <2 x float> %v, %v2
+  ret <2 x float> %sum
+}
+
+declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind