[clang] [llvm] reduce max wrt divergent mask (PR #135138)
Aniket Lal via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 22 01:27:37 PDT 2025
https://github.com/lalaniket8 updated https://github.com/llvm/llvm-project/pull/135138
>From a4b91e537441d9edba0d39d21eab7e150a066049 Mon Sep 17 00:00:00 2001
From: anikelal <anikelal at amd.com>
Date: Tue, 22 Apr 2025 13:52:27 +0530
Subject: [PATCH] Compiler implementation of wave reduce max builtin over a divergent mask
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 18 ++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 14 ++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 204 ++++++++++++++++++-
llvm/lib/Target/AMDGPU/SIInstructions.td | 12 ++
5 files changed, 244 insertions(+), 6 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 39fef9e4601f8..11765a113a518 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -366,6 +366,8 @@ BUILTIN(__builtin_r600_read_tidig_z, "Ui", "nc")
BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32, "iiii", "nc")
+
//===----------------------------------------------------------------------===//
// MFMA builtins.
//===----------------------------------------------------------------------===//
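For reviewers, a rough usage sketch of the new builtin (not part of the patch). The kernel and the fixed groups-of-four mask are illustrative assumptions; the grouping-by-lowest-set-mask-bit semantics are inferred from the lowering further down, and a wave32 target is assumed:

// HIP device code (illustrative): each group of four consecutive lanes
// computes the maximum of its members' values.  All four lanes pass the same
// 32-bit mask, so they share the lowest set bit and therefore the same
// accumulator slot; strategy 0 selects the default lowering.
__global__ void group_max4(const int *in, int *out) {
  unsigned lane = threadIdx.x & 31u;               // lane id within a wave32
  int mask = (int)(0xFu << (lane & ~3u));          // lanes 4..7 -> 0x000000F0
  out[threadIdx.x] = __builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32(
      in[threadIdx.x], mask, /*strategy=*/0);
}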
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index ad012d98635ff..d6a20d61741d7 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -274,6 +274,15 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}
+static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
+ switch (BuiltinID) {
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32:
+ return Intrinsic::amdgcn_wave_reduce_wrt_divergent_mask_umax;
+ default:
+ llvm_unreachable("Unknown BuiltinID for wave reduction");
+ }
+}
+
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -1179,6 +1188,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
return emitBuiltinWithOneOverloadedType<2>(
*this, E, Intrinsic::amdgcn_s_prefetch_data);
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32: {
+ Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
+ llvm::Value *Value = EmitScalarExpr(E->getArg(0));
+ llvm::Value *Mask = EmitScalarExpr(E->getArg(1));
+ llvm::Value *Strategy = EmitScalarExpr(E->getArg(2));
+ llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
+ return Builder.CreateCall(F, {Value, Mask, Strategy});
+ }
default:
return nullptr;
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 75068717d9a5f..c155a75852473 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2343,6 +2343,20 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+class AMDGPUWaveReduceWrtDivergentMask<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
+ [data_ty],
+ [
+ LLVMMatchType<0>, // LLVM value to reduce (SGPR/VGPR)
+ llvm_i32_ty, // Divergent mask
+ llvm_i32_ty // Reduction strategy switch for lowering (0: Default,
+ // 1: Iterative strategy,
+ // 2: DPP)
+ ],
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
+ ImmArg<ArgIndex<2>>]>;
+
+def int_amdgcn_wave_reduce_wrt_divergent_mask_umin : AMDGPUWaveReduceWrtDivergentMask;
+def int_amdgcn_wave_reduce_wrt_divergent_mask_umax : AMDGPUWaveReduceWrtDivergentMask;
+
def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 724a45062c1f4..f85bcccf6b142 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5030,12 +5030,18 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register SrcReg = MI.getOperand(1).getReg();
bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
Register DstReg = MI.getOperand(0).getReg();
+ bool isDstSGPR = TRI->isSGPRClass(MRI.getRegClass(DstReg));
+
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
// These operations with a uniform value i.e. SGPR are idempotent.
// Reduced value will be same as given sgpr.
// clang-format off
- BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
+ if (isDstSGPR)
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
+ .addReg(SrcReg);
+ else
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), DstReg)
.addReg(SrcReg);
// clang-format on
RetBB = &BB;
@@ -5051,7 +5057,6 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// so that we will get the next active lane for next iteration.
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
-
// Create Control flow for loop
// Split MI's Machine Basic block into For loop
auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
@@ -5059,14 +5064,17 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// Create virtual registers required for lowering.
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+ const TargetRegisterClass *AccumRegClass =
+ isDstSGPR ? DstRegClass : &AMDGPU::SReg_32RegClass;
+ Register AccumReg = MRI.createVirtualRegister(AccumRegClass);
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
+ Register InitalValReg = MRI.createVirtualRegister(AccumRegClass);
- Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+ Register AccumulatorReg = MRI.createVirtualRegister(AccumRegClass);
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+ Register FF1Reg = MRI.createVirtualRegister(AccumRegClass);
Register LaneValueReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@@ -5106,10 +5114,14 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
.addReg(SrcReg)
.addReg(FF1->getOperand(0).getReg());
- auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), AccumReg)
.addReg(Accumulator->getOperand(0).getReg())
.addReg(LaneValue->getOperand(0).getReg());
+ BuildMI(*ComputeLoop, I, DL,
+ TII->get(isDstSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32),
+ DstReg)
+ .addReg(AccumReg);
// Manipulate the iterator to get the next active lane
unsigned BITSETOpc =
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
@@ -5138,6 +5150,171 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
return RetBB;
}
+static MachineBasicBlock *
+lowerWaveReduceWrtDivergentMask(MachineInstr &MI, MachineBasicBlock &BB,
+ const GCNSubtarget &ST, unsigned Opc) {
+ MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ // Reduction operations depend on whether the input operand is SGPR or
+ // VGPR.
+ Register SrcReg = MI.getOperand(1).getReg();
+ bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
+ Register DstReg = MI.getOperand(0).getReg();
+ Register DivergentMaskReg = MI.getOperand(2).getReg();
+
+ MachineBasicBlock *RetBB = nullptr;
+ if (isSGPR) {
+ // DstReg is a VGPR (the pseudo's result class), so use a VALU move.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), DstReg).addReg(SrcReg);
+ RetBB = &BB;
+ } else {
+
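+ // Lower to an iterative loop with one iteration per active lane.  Lanes
+ // whose mask values share the same lowest set bit form a group; each
+ // group's partial result is kept in its "leader" lane of an accumulator
+ // VGPR and broadcast back to all members with DS_BPERMUTE at the end.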
+ MachineBasicBlock::iterator I = BB.end();
+
+ auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+ auto SReg32XM0RegClass = &AMDGPU::SReg_32_XM0RegClass;
+ auto SReg32RegClass = &AMDGPU::SReg_32RegClass;
+
+ const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+ const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+ Register ExecCopyReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register AccSGPRReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+ Register UpdatedAccSGPRReg = MRI.createVirtualRegister(SReg32RegClass);
+ Register AccReg1 = MRI.createVirtualRegister(DstRegClass);
+ Register AccReg = MRI.createVirtualRegister(DstRegClass);
+ Register BPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+ Register UpdatedBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+ Register InitialBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+ Register UpdatedAccReg = MRI.createVirtualRegister(DstRegClass);
+ Register ActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register UpdatedActiveLanesReg =
+ MRI.createVirtualRegister(WaveMaskRegClass);
+ Register FF1ActiveLanesReg = MRI.createVirtualRegister(SReg32RegClass);
+ Register FF1MaskReg = MRI.createVirtualRegister(SReg32RegClass);
+ Register FF1MaskX4Reg = MRI.createVirtualRegister(SReg32RegClass);
+ Register ValReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+ Register MaskReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+
+ bool IsWave32 = ST.isWave32();
+
+ uint32_t IdentityValue =
+ (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+
+ BuildMI(BB, I, DL,
+ TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+ ExecCopyReg)
+ .addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+
+ BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), AccReg)
+ .addImm(IdentityValue);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), InitialBPermAddrReg)
+ .addImm(0);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+ I = ComputeLoop->end();
+
+ auto PhiActiveLanesInst =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveLanesReg)
+ .addReg(ExecCopyReg)
+ .addMBB(&BB);
+ auto PhiAccInst =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccReg1)
+ .addReg(AccReg)
+ .addMBB(&BB);
+ auto PhiBPermAddrInst =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), BPermAddrReg)
+ .addReg(InitialBPermAddrReg)
+ .addMBB(&BB);
+
+ BuildMI(*ComputeLoop, I, DL,
+ TII->get(IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64),
+ FF1ActiveLanesReg)
+ .addReg(ActiveLanesReg);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), ValReg)
+ .addReg(SrcReg)
+ .addReg(FF1ActiveLanesReg);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), MaskReg)
+ .addReg(DivergentMaskReg)
+ .addReg(FF1ActiveLanesReg);
+
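+ // The lowest set bit of this lane's mask names the leader lane whose slot
+ // in the accumulator VGPR holds the group's running result.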
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B32), FF1MaskReg)
+ .addReg(MaskReg);
+
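+ // Read the group's accumulator from the leader lane, combine it with the
+ // current lane's value, and write the result back to the leader lane.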
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), AccSGPRReg)
+ .addReg(AccReg1)
+ .addReg(FF1MaskReg);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), UpdatedAccSGPRReg)
+ .addReg(AccSGPRReg)
+ .addReg(ValReg);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(FF1MaskReg);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32),
+ UpdatedAccReg)
+ .addReg(UpdatedAccSGPRReg)
+ .addReg(AMDGPU::M0)
+ .addReg(AccReg1);
+
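+ // Record, in the current lane's slot of the bpermute address register,
+ // the byte address (leader lane * 4) that DS_BPERMUTE will read from.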
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_LSHL_B32), FF1MaskX4Reg)
+ .addReg(FF1MaskReg)
+ .addImm(2);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(FF1ActiveLanesReg);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32),
+ UpdatedBPermAddrReg)
+ .addReg(FF1MaskX4Reg)
+ .addReg(AMDGPU::M0)
+ .addReg(BPermAddrReg);
+
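+ // Clear the processed lane from the set of remaining active lanes and
+ // loop while any are left.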
+ unsigned BITSETOpc =
+ IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), UpdatedActiveLanesReg)
+ .addReg(FF1ActiveLanesReg)
+ .addReg(ActiveLanesReg);
+
+ PhiActiveLanesInst.addReg(UpdatedActiveLanesReg).addMBB(ComputeLoop);
+ PhiAccInst.addReg(UpdatedAccReg).addMBB(ComputeLoop);
+ PhiBPermAddrInst.addReg(UpdatedBPermAddrReg).addMBB(ComputeLoop);
+
+ unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+ .addReg(UpdatedActiveLanesReg)
+ .addImm(0);
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ .addMBB(ComputeLoop);
+
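+ // After the loop, every lane fetches its group's final result from its
+ // leader lane via DS_BPERMUTE.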
+ BuildMI(*ComputeEnd, ComputeEnd->begin(), DL,
+ TII->get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+ .addReg(UpdatedBPermAddrReg)
+ .addReg(UpdatedAccReg)
+ .addImm(0);
+
+ RetBB = ComputeEnd;
+ }
+ MI.eraseFromParent();
+ return RetBB;
+}
+
MachineBasicBlock *
SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -5151,6 +5328,21 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+ case AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32:
+ case AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMAX_PSEUDO_U32: {
+ unsigned Opc = (MI.getOpcode() ==
+ AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32)
+ ? AMDGPU::S_MIN_U32
+ : AMDGPU::S_MAX_U32;
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ bool isMaskRegUniform = getSubtarget()->getRegisterInfo()->isSGPRClass(
+ MRI.getRegClass(MI.getOperand(2).getReg()));
+
+ if (isMaskRegUniform)
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), Opc);
+
+ return lowerWaveReduceWrtDivergentMask(MI, *BB, *getSubtarget(), Opc);
+ }
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
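For reference, a scalar C++ model of what the ComputeLoop above computes for the umax flavour (a sketch only, assuming a 32-lane wave; it is not part of the patch):

#include <algorithm>
#include <array>
#include <cstdint>

// Lane i contributes value[i] and mask[i].  The lowest set bit of mask[i]
// names the leader lane of i's group; acc[leader] holds the group's running
// maximum (identity 0), and every active lane finally reads its leader's
// slot, which is the role DS_BPERMUTE plays in the generated code.
std::array<uint32_t, 32> waveReduceUMaxWrtMask(
    const std::array<uint32_t, 32> &value,
    const std::array<uint32_t, 32> &mask, uint32_t exec) {
  std::array<uint32_t, 32> acc{}, leader{}, result{};
  for (unsigned lane = 0; lane < 32; ++lane) {
    if (!(exec >> lane & 1u))
      continue;                                  // skip inactive lanes
    unsigned ldr = __builtin_ctz(mask[lane]);    // S_FF1 of the lane's mask
    acc[ldr] = std::max(acc[ldr], value[lane]);  // S_MAX_U32 into leader slot
    leader[lane] = ldr;                          // bpermute address / 4
  }
  for (unsigned lane = 0; lane < 32; ++lane)
    if (exec >> lane & 1u)
      result[lane] = acc[leader[lane]];          // broadcast group results
  return result;
}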
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ed45cf8851146..7e1010b0f8567 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -315,6 +315,18 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
}
}
+let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+ def WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+ (ins VSrc_b32: $src, VSrc_b32: $mask, VSrc_b32:$strategy),
+ [(set i32:$vdst, (int_amdgcn_wave_reduce_wrt_divergent_mask_umin i32:$src, i32:$mask, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_WRT_DIVERGENT_MASK_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+ (ins VSrc_b32: $src, VSrc_b32: $mask, VSrc_b32:$strategy),
+ [(set i32:$vdst, (int_amdgcn_wave_reduce_wrt_divergent_mask_umax i32:$src, i32:$mask, i32:$strategy))]> {
+ }
+}
+
let usesCustomInserter = 1, Defs = [VCC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
(outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),