[llvm] reduce over divergent wave (PR #133228)
Aniket Lal via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 27 03:22:24 PDT 2025
https://github.com/lalaniket8 created https://github.com/llvm/llvm-project/pull/133228
(No pull request description was provided.)
>From c5c1cc54524d839f148c1390b659772fcabc0a4a Mon Sep 17 00:00:00 2001
From: anikelal <anikelal at amd.com>
Date: Thu, 27 Mar 2025 15:50:13 +0530
Subject: [PATCH] reduce over divergent wave
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 7 ++--
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 42 ++++++++++++++-----
llvm/lib/Target/AMDGPU/SIInstructions.td | 12 +++---
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 16 +++----
newreduceumax.ll | 12 ++++++
5 files changed, 62 insertions(+), 27 deletions(-)
create mode 100644 newreduceumax.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 86e050333acc7..9fe327cec13d1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2313,15 +2313,16 @@ def int_amdgcn_s_quadmask :
def int_amdgcn_s_wqm :
DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;
-class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
+class AMDGPUWaveReduce<LLVMType data_ty = llvm_i32_ty> : Intrinsic<
[data_ty],
[
- LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
+ llvm_i32_ty, // llvm value to reduce (SGPR/VGPR),
+ llvm_i64_ty, // Divergent mask
llvm_i32_ty // Reduction Strategy Switch for lowering ( 0: Default,
// 1: Iterative strategy, and
// 2. DPP)
],
- [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<2>>]>;
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9743320601ed4..8bca356327896 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4970,11 +4970,21 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const DebugLoc &DL = MI.getDebugLoc();
const SIInstrInfo *TII = ST.getInstrInfo();
-
+ const MachineFunction *MF = BB.getParent();
+ const TargetRegisterInfo *TrgtRegInfo = MF->getSubtarget().getRegisterInfo();
// Reduction operations depend on whether the input operand is SGPR or VGPR.
Register SrcReg = MI.getOperand(1).getReg();
- bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
+ auto SrcRegClass = MRI.getRegClass(SrcReg);
+ llvm::errs() << TrgtRegInfo->getRegClassName(SrcRegClass) << "\n";
+ bool isSGPR = TRI->isSGPRClass(SrcRegClass);
Register DstReg = MI.getOperand(0).getReg();
+ llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DstReg)) << "\n";
+ Register MaskReg = MI.getOperand(2).getReg();
+ llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
+
+ // llvm::errs() << "srcreg:" << MRI.getRegClassName(MRI.getRegClass(SrcReg)) << "\n";
+ // llvm::errs() << "DstReg:" << MRI.getRegClassName(MRI.getRegClass(DstReg)) << "\n";
+ // llvm::errs() << "MaskReg:" << MRI.getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
// These operations with a uniform value i.e. SGPR are idempotent.
@@ -5005,15 +5015,19 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
+ Register InitalValReg = MRI.createVirtualRegister(DstRegClass);//MRI.getRegClass(SrcReg)
- Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+ Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);//MRI.getRegClass(SrcReg)
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register TempRegMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+ Register FF1MaskReg = MRI.createVirtualRegister(DstRegClass);
Register LaneValueReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register MaskLaneValueReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
bool IsWave32 = ST.isWave32();
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -5024,9 +5038,11 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
uint32_t InitalValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
auto TmpSReg =
- BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
+ BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); //s_mov_b64 s[2:3], exec
+ // auto TmpMaskSReg =
+ // BuildMI(BB, I, DL, TII->get(MovOpc), TempRegMaskReg).addReg(MaskReg); //s_mov_b64 %mask_copy.sreg, %mask
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
- .addImm(InitalValue);
+ .addImm(InitalValue);//s_mov_b32 s4, 0 | %17:sgpr_32 = S_MOV_B32 0
// clang-format off
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(ComputeLoop);
@@ -5046,14 +5062,20 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// Perform the computations
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
- .addReg(ActiveBits->getOperand(0).getReg());
+ .addReg(ActiveBits->getOperand(0).getReg());//%index.sgpr = S_FF1_I32_B64 %exec_copy.sreg
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
.addReg(SrcReg)
- .addReg(FF1->getOperand(0).getReg());
+ .addReg(FF1->getOperand(0).getReg());//%value_at_lane_index.sreg = V_READLANE %value.vgpr %index.sgpr
+ auto MaskLaneValue = BuildMI(*ComputeLoop, I, DL,
+ TII->get(AMDGPU::V_READLANE_B32), MaskLaneValueReg)
+ .addReg(MaskReg)
+ .addReg(FF1->getOperand(0).getReg());//%mask_at_lane_index.sreg = V_READLANE %mask.vgpr %index.sgpr
+ auto FF2 = BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B64), FF1Reg)
+ .addReg(MaskLaneValue->getOperand(0).getReg());//%subgroupindex.sgpr = S_FF1_I32_B64 %mask_at_lane_index.sreg
auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
.addReg(Accumulator->getOperand(0).getReg())
- .addReg(LaneValue->getOperand(0).getReg());
+ .addReg(LaneValue->getOperand(0).getReg());//%acc.sgpr = max %acc.sgpr %value_at_lane_index.sreg
// Manipulate the iterator to get the next active lane
unsigned BITSETOpc =
@@ -5061,7 +5083,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
auto NewActiveBits =
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
.addReg(FF1->getOperand(0).getReg())
- .addReg(ActiveBits->getOperand(0).getReg());
+ .addReg(ActiveBits->getOperand(0).getReg());//%bitsetresult = S_BITSET0_B64 %exec_copy
// Add phi nodes
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index de77401eb0137..6b651971375bc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -298,14 +298,14 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
- def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
+ def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b64: $mask, VSrc_b32:$strategy),
+ [(i32 (int_amdgcn_wave_reduce_umin i32:$src, i64:$mask, i32:$strategy))]> {
}
- def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
+ def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b64: $mask, VSrc_b32:$strategy),
+ [(i32 (int_amdgcn_wave_reduce_umax i32:$src, i64:$mask, i32:$strategy))]> {
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index deeceed3a19be..f85b94198c390 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -12,7 +12,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
-declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
@@ -122,12 +122,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1132GISEL-NEXT: s_endpgm
entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
+ %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 15, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-LABEL: const_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -218,7 +218,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1132GISEL-NEXT: s_endpgm
entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
+ %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 %in, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -256,7 +256,7 @@ define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
; GFX11GISEL: ; %bb.0: ; %entry
; GFX11GISEL-NEXT: s_endpgm
entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 1)
+ %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 poison, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -499,7 +499,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_endpgm
entry:
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
- %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 1)
+ %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 %in, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -937,11 +937,11 @@ entry:
br i1 %d_cmp, label %if, label %else
if:
- %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 1)
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 %in, i32 1)
br label %endif
else:
- %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 %in, i32 1)
br label %endif
endif:
diff --git a/newreduceumax.ll b/newreduceumax.ll
new file mode 100644
index 0000000000000..f6d95374b0e99
--- /dev/null
+++ b/newreduceumax.ll
@@ -0,0 +1,12 @@
+define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, ptr addrspace(1) %maskarr, i32 %in) {
+
+ entry:
+ %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+ %mask_ptr = getelementptr inbounds i32, ptr addrspace(1) %maskarr, i32 %id.x
+ ; %mask_ptr_casted = bitcast ptr addrspace(1) %mask_ptr to ptr
+ %mask = load i32, ptr addrspace(1) %mask_ptr
+ %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 %mask, i32 1)
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+
+}
\ No newline at end of file
More information about the llvm-commits
mailing list