[llvm] 57c943d - [AMDGPU] Only raise wave priority if there is a long enough sequence of VALU instructions.
Ivan Kosarev via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 8 07:21:56 PDT 2022
Author: Ivan Kosarev
Date: 2022-09-08T15:21:30+01:00
New Revision: 57c943d5811df35c50b6f9b6068d14d9bfd8b338
URL: https://github.com/llvm/llvm-project/commit/57c943d5811df35c50b6f9b6068d14d9bfd8b338
DIFF: https://github.com/llvm/llvm-project/commit/57c943d5811df35c50b6f9b6068d14d9bfd8b338.diff
LOG: [AMDGPU] Only raise wave priority if there is a long enough sequence of VALU instructions.
Reviewed By: nhaehnle
Differential Revision: https://reviews.llvm.org/D124671
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
index 34702ee6623bb..da2a2b9353215 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -26,11 +26,18 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-set-wave-priority"
+static cl::opt<unsigned> DefaultVALUInstsThreshold(
+ "amdgpu-set-wave-priority-valu-insts-threshold",
+ cl::desc("VALU instruction count threshold for adjusting wave priority"),
+ cl::init(100), cl::Hidden);
+
namespace {
struct MBBInfo {
MBBInfo() = default;
+ unsigned NumVALUInstsAtStart = 0;
bool MayReachVMEMLoad = false;
+ MachineInstr *LastVMEMLoad = nullptr;
};
using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>;
@@ -46,7 +53,9 @@ class AMDGPUSetWavePriority : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) override;
private:
- MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const;
+ MachineInstr *BuildSetprioMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned priority) const;
const SIInstrInfo *TII;
};
@@ -62,9 +71,12 @@ FunctionPass *llvm::createAMDGPUSetWavePriorityPass() {
return new AMDGPUSetWavePriority();
}
-MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF,
- unsigned priority) const {
- return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority);
+MachineInstr *
+AMDGPUSetWavePriority::BuildSetprioMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned priority) const {
+ return BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETPRIO))
+ .addImm(priority);
}
// Checks that for every predecessor Pred that can reach a VMEM load,
@@ -97,21 +109,58 @@ bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
+ unsigned VALUInstsThreshold = DefaultVALUInstsThreshold;
+ Attribute A = F.getFnAttribute("amdgpu-wave-priority-threshold");
+ if (A.isValid())
+ A.getValueAsString().getAsInteger(0, VALUInstsThreshold);
+
+ // Find VMEM loads that may be executed before long-enough sequences of
+ // VALU instructions. We currently assume that backedges/loops, branch
+ // probabilities and other details can be ignored, so we essentially
+ // determine the largest number of VALU instructions along every
+ // possible path from the start of the function that may potentially be
+ // executed provided no backedge is ever taken.
MBBInfoSet MBBInfos;
- SmallVector<const MachineBasicBlock *, 16> Worklist;
- for (MachineBasicBlock &MBB : MF) {
- if (any_of(MBB, isVMEMLoad))
- Worklist.push_back(&MBB);
- }
-
- // Mark blocks from which control may reach VMEM loads.
- while (!Worklist.empty()) {
- const MachineBasicBlock *MBB = Worklist.pop_back_val();
+ for (MachineBasicBlock *MBB : post_order(&MF)) {
MBBInfo &Info = MBBInfos[MBB];
- if (!Info.MayReachVMEMLoad) {
- Info.MayReachVMEMLoad = true;
- Worklist.append(MBB->pred_begin(), MBB->pred_end());
+ bool AtStart = true;
+ unsigned MaxNumVALUInstsInMiddle = 0;
+ unsigned NumVALUInstsAtEnd = 0;
+ for (MachineInstr &MI : *MBB) {
+ if (isVMEMLoad(MI)) {
+ AtStart = false;
+ Info.NumVALUInstsAtStart = 0;
+ MaxNumVALUInstsInMiddle = 0;
+ NumVALUInstsAtEnd = 0;
+ Info.LastVMEMLoad = &MI;
+ } else if (SIInstrInfo::isDS(MI)) {
+ AtStart = false;
+ MaxNumVALUInstsInMiddle =
+ std::max(MaxNumVALUInstsInMiddle, NumVALUInstsAtEnd);
+ NumVALUInstsAtEnd = 0;
+ } else if (SIInstrInfo::isVALU(MI)) {
+ if (AtStart)
+ ++Info.NumVALUInstsAtStart;
+ ++NumVALUInstsAtEnd;
+ }
}
+
+ bool SuccsMayReachVMEMLoad = false;
+ unsigned NumFollowingVALUInsts = 0;
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ SuccsMayReachVMEMLoad |= MBBInfos[Succ].MayReachVMEMLoad;
+ NumFollowingVALUInsts =
+ std::max(NumFollowingVALUInsts, MBBInfos[Succ].NumVALUInstsAtStart);
+ }
+ if (AtStart)
+ Info.NumVALUInstsAtStart += NumFollowingVALUInsts;
+ NumVALUInstsAtEnd += NumFollowingVALUInsts;
+
+ unsigned MaxNumVALUInsts =
+ std::max(MaxNumVALUInstsInMiddle, NumVALUInstsAtEnd);
+ Info.MayReachVMEMLoad =
+ SuccsMayReachVMEMLoad ||
+ (Info.LastVMEMLoad && MaxNumVALUInsts >= VALUInstsThreshold);
}
MachineBasicBlock &Entry = MF.front();
@@ -122,10 +171,10 @@ bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end();
while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator())
++I;
- Entry.insert(I, BuildSetprioMI(MF, HighPriority));
+ BuildSetprioMI(Entry, I, HighPriority);
// Lower the priority on edges where control leaves blocks from which
- // VMEM loads are reachable.
+ // the VMEM loads are reachable.
SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
for (MachineBasicBlock &MBB : MF) {
if (MBBInfos[&MBB].MayReachVMEMLoad) {
@@ -152,14 +201,12 @@ bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
}
for (MachineBasicBlock *MBB : PriorityLoweringBlocks) {
- MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin();
- while (I != B) {
- if (isVMEMLoad(*--I)) {
- ++I;
- break;
- }
- }
- MBB->insert(I, BuildSetprioMI(MF, LowPriority));
+ BuildSetprioMI(
+ *MBB,
+ MBBInfos[MBB].LastVMEMLoad
+ ? std::next(MachineBasicBlock::iterator(MBBInfos[MBB].LastVMEMLoad))
+ : MBB->begin(),
+ LowPriority);
}
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
index ed7dc6239e35e..eb720b2ec6fa6 100644
--- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
+++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -4,8 +4,9 @@
; CHECK-LABEL: no_setprio:
; CHECK-NOT: s_setprio
; CHECK: ; return to shader part epilog
-define amdgpu_ps <2 x float> @no_setprio() {
- ret <2 x float> <float 0.0, float 0.0>
+define amdgpu_ps <2 x float> @no_setprio(<2 x float> %a, <2 x float> %b) "amdgpu-wave-priority-threshold"="1" {
+ %s = fadd <2 x float> %a, %b
+ ret <2 x float> %s
}
; CHECK-LABEL: vmem_in_exit_block:
@@ -13,9 +14,10 @@ define amdgpu_ps <2 x float> @no_setprio() {
; CHECK: buffer_load_dwordx2
; CHECK-NEXT: s_setprio 0
; CHECK: ; return to shader part epilog
-define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
+define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p, <2 x float> %x) "amdgpu-wave-priority-threshold"="2" {
%v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
- ret <2 x float> %v
+ %s = fadd <2 x float> %v, %x
+ ret <2 x float> %s
}
; CHECK-LABEL: branch:
@@ -29,7 +31,7 @@ define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
; CHECK-NEXT: s_setprio 0
; CHECK: s_branch [[EXIT]]
; CHECK-NEXT: [[EXIT]]:
-define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) {
+define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i, <2 x float> %x) "amdgpu-wave-priority-threshold"="2" {
%cond = icmp eq i32 %i, 0
br i1 %cond, label %a, label %b
@@ -38,7 +40,8 @@ a:
b:
%v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
- ret <2 x float> %v
+ %s = fadd <2 x float> %v, %x
+ ret <2 x float> %s
}
; CHECK-LABEL: setprio_follows_setprio:
@@ -48,7 +51,7 @@ b:
; CHECK: {{.*}}: ; %a
; CHECK: buffer_load_dwordx2
; CHECK-NEXT: s_setprio 0
-; CHECK: s_cbranch_scc1 [[C]]
+; CHECK: s_cbranch_vccnz [[C]]
; CHECK: {{.*}}: ; %b
; CHECK-NOT: s_setprio
; CHECK: s_branch [[EXIT:.*]]
@@ -56,7 +59,7 @@ b:
; CHECK-NEXT: s_setprio 0
; CHECK: s_branch [[EXIT]]
; CHECK: [[EXIT]]:
-define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) {
+define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="3" {
entry:
%v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
%cond1 = icmp ne i32 %i, 0
@@ -64,15 +67,16 @@ entry:
a:
%v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
- %cond2 = icmp ne i32 %i, 1
+ %v20 = extractelement <2 x float> %v2, i32 0
+ %v21 = extractelement <2 x float> %v2, i32 1
+ %cond2 = fcmp ult float %v20, %v21
br i1 %cond2, label %b, label %c
b:
ret <2 x float> %v2
c:
- %v3 = phi <2 x float> [%v1, %entry], [%v2, %a]
- %v4 = fadd <2 x float> %v1, %v3
+ %v4 = fadd <2 x float> %v1, %v1
ret <2 x float> %v4
}
@@ -87,7 +91,7 @@ c:
; CHECK: s_cbranch_scc1 [[LOOP]]
; CHECK-NEXT: {{.*}}: ; %exit
; CHECK-NEXT: s_setprio 0
-define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) {
+define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) "amdgpu-wave-priority-threshold"="2" {
entry:
br label %loop
@@ -125,7 +129,7 @@ exit:
; CHECK-NEXT: s_setprio 0
; CHECK: s_branch [[RET]]
; CHECK: [[RET]]:
-define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) {
+define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) "amdgpu-wave-priority-threshold"="2" {
entry:
%v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
%cond = icmp ne i32 %x, 0
@@ -150,4 +154,53 @@ another_load:
ret <2 x float> %sum
}
+; CHECK-LABEL: valu_insts_threshold:
+; CHECK: s_setprio 3
+; CHECK: buffer_load_dwordx2
+; CHECK-NEXT: s_setprio 0
+; CHECK-COUNT-4: v_add_f32_e32
+; CHECK: s_cbranch_scc0 [[A:.*]]
+; CHECK: {{.*}}: ; %b
+; CHECK-NEXT: buffer_load_dwordx2
+; CHECK: s_branch [[END:.*]]
+; CHECK: [[A]]: ; %a
+; CHECK: s_branch [[END]]
+; CHECK: [[END]]:
+define amdgpu_ps <2 x float> @valu_insts_threshold(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="4" {
+ %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+ %add = fadd <2 x float> %v, %v
+ %add2 = fadd <2 x float> %add, %add
+
+ %cond = icmp eq i32 %i, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ ret <2 x float> %add2
+
+b:
+ %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 1, i32 0, i32 0)
+ %sub = fsub <2 x float> %add2, %v2
+ ret <2 x float> %sub
+}
+
+; CHECK-LABEL: valu_insts_threshold2:
+; CHECK-NOT: s_setprio
+; CHECK: ; -- End function
+define amdgpu_ps <2 x float> @valu_insts_threshold2(<4 x i32> inreg %p, i32 inreg %i) "amdgpu-wave-priority-threshold"="5" {
+ %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+ %add = fadd <2 x float> %v, %v
+ %add2 = fadd <2 x float> %add, %add
+
+ %cond = icmp eq i32 %i, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ ret <2 x float> %add2
+
+b:
+ %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 1, i32 0, i32 0)
+ %sub = fsub <2 x float> %add2, %v2
+ ret <2 x float> %sub
+}
+
declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind
More information about the llvm-commits mailing list