[clang] [llvm] reduce max wrt divergent mask (PR #135138)
Aniket Lal via cfe-commits
cfe-commits at lists.llvm.org
Wed Apr 9 23:55:59 PDT 2025
https://github.com/lalaniket8 created https://github.com/llvm/llvm-project/pull/135138
This patch adds a new AMDGPU builtin, __builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32, together with umin/umax intrinsics, SI pseudo instructions, and a custom inserter that lowers them. The lowering is an iterative loop: each active lane's value is folded into the accumulator slot of its group's leader lane (the lowest set bit of that lane's divergent mask), and each group's result is then broadcast to its members with DS_BPERMUTE_B32.
From 810ad9859bbcd66d6942e497c25bdde27978bf3c Mon Sep 17 00:00:00 2001
From: anikelal <anikelal at amd.com>
Date: Wed, 9 Apr 2025 11:24:21 +0530
Subject: [PATCH] reduce max wrt divergent mask
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 18 +++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 14 ++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 157 +++++++++++++++++++
llvm/lib/Target/AMDGPU/SIInstructions.td | 12 ++
5 files changed, 203 insertions(+)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index cbef637be213a..642c25f6a0bff 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -366,6 +366,8 @@ BUILTIN(__builtin_r600_read_tidig_z, "Ui", "nc")
BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32, "iiii", "nc")
+
//===----------------------------------------------------------------------===//
// MFMA builtins.
//===----------------------------------------------------------------------===//
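For reference, device-side usage of the new builtin would look roughly like the sketch below; the variable names and how the mask is produced are assumptions for illustration, not part of the patch:

  // v: this lane's value; mask: bitmask of the lanes in this lane's
  // group; the last argument selects the lowering strategy (0: default).
  int reduced =
      __builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32(v, mask, 0);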
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index b56b739094ff3..15c0eeab4a78c 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -284,6 +284,15 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}
+static Intrinsic::ID getIntrinsicIDForWaveReduction(unsigned BuiltinID) {
+  switch (BuiltinID) {
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32:
+    return Intrinsic::amdgcn_wave_reduce_wrt_divergent_mask_umax;
+  default:
+    llvm_unreachable("Unknown BuiltinID for wave reduction");
+  }
+}
+
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -1142,6 +1151,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
return emitBuiltinWithOneOverloadedType<2>(
*this, E, Intrinsic::amdgcn_s_prefetch_data);
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32: {
+    Intrinsic::ID IID = getIntrinsicIDForWaveReduction(BuiltinID);
+    llvm::Value *Value = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Mask = EmitScalarExpr(E->getArg(1));
+    llvm::Value *Strategy = EmitScalarExpr(E->getArg(2));
+    llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
+    return Builder.CreateCall(F, {Value, Mask, Strategy});
+  }
default:
return nullptr;
}
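The CreateCall above should produce IR along the following lines; the intrinsic is overloaded on the value type (hence the .i32 suffix), and the SSA value names are illustrative:

  %res = call i32 @llvm.amdgcn.wave.reduce.wrt.divergent.mask.umax.i32(
             i32 %val, i32 %mask, i32 0)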
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ae2f6e62c0272..b0a4ab04f4ca8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2349,6 +2349,20 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+class AMDGPUWaveReduceWrtDivergentMask<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
+    [data_ty],
+    [
+      LLVMMatchType<0>, // Value to reduce (SGPR or VGPR).
+      llvm_i32_ty,      // Divergent mask naming the lanes in this lane's group.
+      llvm_i32_ty       // Reduction strategy for lowering (0: default,
+                        // 1: iterative, 2: DPP). Must be an immediate.
+    ],
+    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
+     ImmArg<ArgIndex<2>>]>;
+
+def int_amdgcn_wave_reduce_wrt_divergent_mask_umin : AMDGPUWaveReduceWrtDivergentMask;
+def int_amdgcn_wave_reduce_wrt_divergent_mask_umax : AMDGPUWaveReduceWrtDivergentMask;
+
def int_amdgcn_readfirstlane :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
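A note on the intended semantics, as read off the custom inserter later in this patch: each lane supplies a value and a mask naming the lanes it reduces with, the lowest set bit of the mask is treated as the group's leader lane, and every lane receives the reduction over the lanes that name the same leader. A small hypothetical wave32 example:

  lane:   0       1       2       3      (lanes 4..31 inactive)
  mask:   0b0011  0b0011  0b1100  0b1100
  value:  5       9       2       7
  umax:   9       9       7       7      (lanes 0-1 and 2-3 form two groups)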
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 356040da95672..83ccba7cf1481 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5143,6 +5143,159 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
return RetBB;
}
+static MachineBasicBlock *
+lowerWaveReduceWrtDivergentMask(MachineInstr &MI, MachineBasicBlock &BB,
+                                const GCNSubtarget &ST, unsigned Opc) {
+  MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // The lowering depends on whether the input operand is SGPR or VGPR.
+  Register SrcReg = MI.getOperand(1).getReg();
+  bool IsSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
+  Register DstReg = MI.getOperand(0).getReg();
+  Register DivergentMaskReg = MI.getOperand(2).getReg();
+
+  MachineBasicBlock *RetBB = nullptr;
+  if (IsSGPR) {
+    // A uniform input is already reduced; copy it through.
+    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+    RetBB = &BB;
+  } else {
+    MachineBasicBlock::iterator I = BB.end();
+
+    auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+    const TargetRegisterClass *SReg32XM0RegClass =
+        &AMDGPU::SReg_32_XM0RegClass;
+    const TargetRegisterClass *SReg32RegClass = &AMDGPU::SReg_32RegClass;
+    const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+    const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+    Register ExecCopyReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register AccSGPRReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+    Register UpdatedAccSGPRReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register AccReg1 = MRI.createVirtualRegister(DstRegClass);
+    Register AccReg = MRI.createVirtualRegister(DstRegClass);
+    Register BPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register UpdatedBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register InitialBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register UpdatedAccReg = MRI.createVirtualRegister(DstRegClass);
+    Register ActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register UpdatedActiveLanesReg =
+        MRI.createVirtualRegister(WaveMaskRegClass);
+    Register FF1ActiveLanesReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register FF1MaskReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register FF1MaskX4Reg = MRI.createVirtualRegister(SReg32RegClass);
+    Register ValReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+    Register MaskReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+
+    bool IsWave32 = ST.isWave32();
+
+    uint32_t IdentityValue =
+        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+
+    // Save a copy of EXEC; the loop below clears one of its bits per trip.
+    BuildMI(BB, I, DL,
+            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+            ExecCopyReg)
+        .addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+
+    // Seed the accumulator with the identity and the bpermute addresses
+    // with zero.
+    BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), AccReg)
+        .addImm(IdentityValue);
+    BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), InitialBPermAddrReg)
+        .addImm(0);
+    BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+    I = ComputeLoop->end();
+
+    auto PhiActiveLanesInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveLanesReg)
+            .addReg(ExecCopyReg)
+            .addMBB(&BB);
+    auto PhiAccInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccReg1)
+            .addReg(AccReg)
+            .addMBB(&BB);
+    auto PhiBPermAddrInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), BPermAddrReg)
+            .addReg(InitialBPermAddrReg)
+            .addMBB(&BB);
+
+    // Find the lowest remaining active lane.
+    BuildMI(*ComputeLoop, I, DL,
+            TII->get(IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64),
+            FF1ActiveLanesReg)
+        .addReg(ActiveLanesReg);
+
+    // Read that lane's value and its divergent mask.
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), ValReg)
+        .addReg(SrcReg)
+        .addReg(FF1ActiveLanesReg);
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), MaskReg)
+        .addReg(DivergentMaskReg)
+        .addReg(FF1ActiveLanesReg);
+
+    // The lowest set bit of the mask identifies the lane's group leader.
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B32), FF1MaskReg)
+        .addReg(MaskReg);
+
+    // Fold the value into the accumulator slot of the group leader.
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), AccSGPRReg)
+        .addReg(AccReg1)
+        .addReg(FF1MaskReg);
+    BuildMI(*ComputeLoop, I, DL, TII->get(Opc), UpdatedAccSGPRReg)
+        .addReg(AccSGPRReg)
+        .addReg(ValReg);
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(FF1MaskReg);
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32),
+            UpdatedAccReg)
+        .addReg(UpdatedAccSGPRReg)
+        .addReg(AMDGPU::M0)
+        .addReg(AccReg1);
+
+    // Record the leader lane's byte address for the final bpermute.
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_LSHL_B32), FF1MaskX4Reg)
+        .addReg(FF1MaskReg)
+        .addImm(2);
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(FF1ActiveLanesReg);
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32),
+            UpdatedBPermAddrReg)
+        .addReg(FF1MaskX4Reg)
+        .addReg(AMDGPU::M0)
+        .addReg(BPermAddrReg);
+
+    // Clear the processed lane and loop while active lanes remain.
+    unsigned BITSETOpc =
+        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), UpdatedActiveLanesReg)
+        .addReg(FF1ActiveLanesReg)
+        .addReg(ActiveLanesReg);
+
+    PhiActiveLanesInst.addReg(UpdatedActiveLanesReg).addMBB(ComputeLoop);
+    PhiAccInst.addReg(UpdatedAccReg).addMBB(ComputeLoop);
+    PhiBPermAddrInst.addReg(UpdatedBPermAddrReg).addMBB(ComputeLoop);
+
+    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+    BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+        .addReg(UpdatedActiveLanesReg)
+        .addImm(0);
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+        .addMBB(ComputeLoop);
+
+    // Broadcast each group's result from its leader lane to all members.
+    BuildMI(*ComputeEnd, ComputeEnd->begin(), DL,
+            TII->get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+        .addReg(UpdatedBPermAddrReg)
+        .addReg(UpdatedAccReg)
+        .addImm(0);
+
+    RetBB = ComputeEnd;
+  }
+  MI.eraseFromParent();
+  return RetBB;
+}
+
MachineBasicBlock *
SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -5156,6 +5309,10 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32:
+    return lowerWaveReduceWrtDivergentMask(MI, *BB, *getSubtarget(),
+                                           AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMAX_PSEUDO_U32:
+    return lowerWaveReduceWrtDivergentMask(MI, *BB, *getSubtarget(),
+                                           AMDGPU::S_MAX_U32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
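To summarize the ComputeLoop above, here is a minimal scalar model of what the VGPR path computes, assuming wave32 and umax; the function and variable names are illustrative, not code from the patch:

  #include <algorithm>
  #include <cstdint>

  // Scalar model of the iterative lowering (wave32, umax). Assumes every
  // active lane carries a non-zero mask naming its group, as the hardware
  // loop does. __builtin_ctz mirrors S_FF1_I32_B32 (find first set bit).
  void waveReduceWrtMaskUMax(const uint32_t Val[32], const uint32_t Mask[32],
                             uint32_t Exec, uint32_t Dst[32]) {
    uint32_t Acc[32] = {0};       // identity for umax (UINT32_MAX for umin)
    uint32_t BPermAddr[32] = {0}; // per-lane DS_BPERMUTE_B32 byte address
    for (uint32_t Active = Exec; Active != 0; Active &= Active - 1) {
      int Lane = __builtin_ctz(Active);       // lowest remaining active lane
      int Leader = __builtin_ctz(Mask[Lane]); // group leader for this lane
      Acc[Leader] = std::max(Acc[Leader], Val[Lane]); // V_READLANE/S_MAX_U32/
                                                      // V_WRITELANE sequence
      BPermAddr[Lane] = 4 * Leader;           // S_LSHL_B32 by 2
    }
    for (int L = 0; L < 32; ++L)              // DS_BPERMUTE_B32 broadcast
      if (Exec & (1u << L))
        Dst[L] = Acc[BPermAddr[L] / 4];
  }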
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9051db0c01ed1..a2ddfdf5be125 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -315,6 +315,18 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
}
}
+let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+  def WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32 :
+    VPseudoInstSI <(outs VGPR_32:$vdst),
+      (ins VSrc_b32:$src, VSrc_b32:$mask, VSrc_b32:$strategy),
+      [(set i32:$vdst, (int_amdgcn_wave_reduce_wrt_divergent_mask_umin
+                          i32:$src, i32:$mask, i32:$strategy))]>;
+
+  def WAVE_REDUCE_WRT_DIVERGENT_MASK_UMAX_PSEUDO_U32 :
+    VPseudoInstSI <(outs VGPR_32:$vdst),
+      (ins VSrc_b32:$src, VSrc_b32:$mask, VSrc_b32:$strategy),
+      [(set i32:$vdst, (int_amdgcn_wave_reduce_wrt_divergent_mask_umax
+                          i32:$src, i32:$mask, i32:$strategy))]>;
+}
+
let usesCustomInserter = 1, Defs = [VCC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
(outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
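With these patterns in place, instruction selection should turn the intrinsic call into the pseudo, roughly as in the MIR sketch below (virtual register numbers are illustrative), which the custom inserter above then expands into the compute loop:

  %3:vgpr_32 = WAVE_REDUCE_WRT_DIVERGENT_MASK_UMAX_PSEUDO_U32 %0:vgpr_32, %1:vgpr_32, %2:vgpr_32, implicit $exec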