[clang] [llvm] reduce over divergent mask (PR #133228)
Aniket Lal via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 9 23:52:34 PDT 2025
https://github.com/lalaniket8 updated https://github.com/llvm/llvm-project/pull/133228
From c8989dc07dec2af1ecc7e8fd07e422e760d3bfb6 Mon Sep 17 00:00:00 2001
From: anikelal <anikelal at amd.com>
Date: Tue, 8 Apr 2025 14:14:10 +0530
Subject: [PATCH] reduce wrt divergent mask
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 +
clang/lib/CodeGen/CGBuiltin.cpp | 18 ++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 5 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 171 +++++++++++-------
llvm/lib/Target/AMDGPU/SIInstructions.td | 12 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 16 +-
6 files changed, 141 insertions(+), 84 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 44ef404aee72f..762e74461a835 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -363,6 +363,8 @@ BUILTIN(__builtin_r600_read_tidig_z, "Ui", "nc")
BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_mask_max_i32, "iiii", "nc")
+
//===----------------------------------------------------------------------===//
// MFMA builtins.
//===----------------------------------------------------------------------===//
@@ -620,5 +622,6 @@ TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUi", "nc", "bitop3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf16_f32, "V2yV2yfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
+
#undef BUILTIN
#undef TARGET_BUILTIN
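For a sense of how the new builtin would be used, here is a minimal HIP device-code sketch. The kernel and buffer names, and the exact mask semantics (each lane passes a bitmask of the lanes in its reduction group, and every lane of a group receives that group's unsigned maximum) are assumptions drawn from the lowering further down, not something the patch documents:

  #include <hip/hip_runtime.h>

  // Each lane reduces its value over the lane group described by its mask.
  __global__ void group_umax(const int *val, const int *mask, int *out) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int v = val[tid];
    int m = mask[tid]; // bitmask of the lanes belonging to this lane's group
    // The third operand selects the lowering strategy (0: default,
    // 1: iterative, 2: DPP) and must be a compile-time constant.
    out[tid] = __builtin_amdgcn_wave_reduce_mask_max_i32(v, m, 0);
  }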
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c126f88b9e3a5..7d07ea0b45f68 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20053,6 +20053,15 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}
+static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
+ switch (BuiltinID) {
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_mask_max_i32:
+ return Intrinsic::amdgcn_wave_reduce_umax;
+ default:
+ llvm_unreachable("Unknown BuiltinID for wave reduction");
+ }
+}
+
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -20360,6 +20369,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_mask_max_i32: {
+ Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
+ llvm::Value *Value = EmitScalarExpr(E->getArg(0));
+ llvm::Value *Mask = EmitScalarExpr(E->getArg(1));
+ llvm::Value *Strategy = EmitScalarExpr(E->getArg(2));
+ llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
+ return Builder.CreateCall(F, {Value, Mask, Strategy});
+ }
case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 86e050333acc7..b85648e6c3077 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2316,12 +2316,13 @@ def int_amdgcn_s_wqm :
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
[data_ty],
[
- LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
+ LLVMMatchType<0>, // llvm value to reduce (SGPR/VGPR)
+ llvm_i32_ty, // Divergent mask
llvm_i32_ty // Reduction Strategy Switch for lowering ( 0: Default,
// 1: Iterative strategy, and
// 2. DPP)
],
- [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
+ [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<2>>]>;
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9743320601ed4..e39dd79c4fd62 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4970,114 +4970,149 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const DebugLoc &DL = MI.getDebugLoc();
const SIInstrInfo *TII = ST.getInstrInfo();
-
// Reduction operations depend on whether the input operand is SGPR or VGPR.
Register SrcReg = MI.getOperand(1).getReg();
- bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
+ auto SrcRegClass = MRI.getRegClass(SrcReg);
+ bool isSGPR = TRI->isSGPRClass(SrcRegClass);
Register DstReg = MI.getOperand(0).getReg();
+ Register DivergentMaskReg = MI.getOperand(2).getReg();
+
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
- // These operations with a uniform value i.e. SGPR are idempotent.
- // Reduced value will be same as given sgpr.
- // clang-format off
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
.addReg(SrcReg);
- // clang-format on
RetBB = &BB;
} else {
- // TODO: Implement DPP Strategy and switch based on immediate strategy
- // operand. For now, for all the cases (default, Iterative and DPP we use
- // iterative approach by default.)
-
- // To reduce the VGPR using iterative approach, we need to iterate
- // over all the active lanes. Lowering consists of ComputeLoop,
- // which iterate over only active lanes. We use copy of EXEC register
- // as induction variable and every active lane modifies it using bitset0
- // so that we will get the next active lane for next iteration.
+ // Lower the reduction iteratively over the divergent mask: walk the active
+ // lanes using a copy of EXEC as the induction variable, fold each lane's
+ // value into its group-leader lane (the lowest set bit of that lane's
+ // mask), and finally broadcast every group's result back to its members
+ // with ds_bpermute.
MachineBasicBlock::iterator I = BB.end();
- Register SrcReg = MI.getOperand(1).getReg();
- // Create Control flow for loop
- // Split MI's Machine Basic block into For loop
auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
- // Create virtual registers required for lowering.
+ auto SReg32XM0RegClass = &AMDGPU::SReg_32_XM0RegClass;
+ auto SReg32RegClass = &AMDGPU::SReg_32RegClass;
+
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
- Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
- Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
- Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
- Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
- Register LaneValueReg =
- MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register ExecCopyReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register ExecCopyReg1 = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register AccSGPRReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+ Register UpdatedAccSGPRReg = MRI.createVirtualRegister(SReg32RegClass);
+ Register AccReg1 = MRI.createVirtualRegister(DstRegClass);
+ Register AccReg = MRI.createVirtualRegister(DstRegClass);
+ Register BPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+ Register UpdatedBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+ Register InitialBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+ Register UpdatedAccReg = MRI.createVirtualRegister(DstRegClass);
+ Register ActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register UpdatedActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register FF1ActiveLanesReg = MRI.createVirtualRegister(SReg32RegClass);
+ Register FF1MaskReg = MRI.createVirtualRegister(SReg32RegClass);
+ Register FF1MaskX4Reg = MRI.createVirtualRegister(SReg32RegClass);
+ Register ValReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+ Register MaskReg = MRI.createVirtualRegister(SReg32XM0RegClass);
bool IsWave32 = ST.isWave32();
- unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- // Create initail values of induction variable from Exec, Accumulator and
- // insert branch instr to newly created ComputeBlockk
- uint32_t InitalValue =
+ uint32_t IdentityValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
- auto TmpSReg =
- BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
- .addImm(InitalValue);
- // clang-format off
+
+ // Copy EXEC to seed the set of lanes that still have to be visited.
+ BuildMI(BB, I, DL, TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), ExecCopyReg)
+ .addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+
+ BuildMI(BB, I, DL, TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), ExecCopyReg1)
+ .addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+
+ // Seed the accumulator with the reduction identity.
+ BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), AccReg)
+ .addImm(IdentityValue);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), InitialBPermAddrReg)
+ .addImm(0);
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(ComputeLoop);
- // clang-format on
- // Start constructing ComputeLoop
I = ComputeLoop->end();
- auto Accumulator =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
- .addReg(InitalValReg)
- .addMBB(&BB);
- auto ActiveBits =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
- .addReg(TmpSReg->getOperand(0).getReg())
- .addMBB(&BB);
+ // PHIs for the remaining-lane mask, the accumulator and the ds_bpermute
+ // address; their loop-carried operands are appended once the loop body
+ // below has defined them.
+ auto PhiActiveLanesInst =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveLanesReg)
+ .addReg(ExecCopyReg)
+ .addMBB(&BB);
+ auto PhiAccInst =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccReg1)
+ .addReg(AccReg)
+ .addMBB(&BB);
+ auto PhiBPermAddrInst =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), BPermAddrReg)
+ .addReg(InitialBPermAddrReg)
+ .addMBB(&BB);
// Perform the computations
- unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
- auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
- .addReg(ActiveBits->getOperand(0).getReg());
- auto LaneValue = BuildMI(*ComputeLoop, I, DL,
- TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
+ // Pick the lowest active lane that has not been processed yet.
+ BuildMI(*ComputeLoop, I, DL,
+ TII->get(IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64), FF1ActiveLanesReg)
+ .addReg(ActiveLanesReg);
+
+ // Read the chosen lane's source value and its divergent mask.
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), ValReg)
.addReg(SrcReg)
- .addReg(FF1->getOperand(0).getReg());
- auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addReg(LaneValue->getOperand(0).getReg());
+ .addReg(FF1ActiveLanesReg);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), MaskReg)
+ .addReg(DivergentMaskReg)
+ .addReg(FF1ActiveLanesReg);
+
+ // The lowest set bit of the mask names the leader lane of this lane's group.
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B32), FF1MaskReg)
+ .addReg(MaskReg);
+
+ // Fold this lane's value into the group leader's partial reduction.
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), AccSGPRReg)
+ .addReg(AccReg1)
+ .addReg(FF1MaskReg);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), UpdatedAccSGPRReg)
+ .addReg(AccSGPRReg)
+ .addReg(ValReg);
+
+ // Write the updated partial result back to the leader lane (lane index in M0).
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(FF1MaskReg);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32), UpdatedAccReg)
+ .addReg(UpdatedAccSGPRReg)
+ .addReg(AMDGPU::M0)
+ .addReg(AccReg1);
+
+ // Record the leader lane's byte offset (lane * 4) as this lane's
+ // ds_bpermute address.
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_LSHL_B32), FF1MaskX4Reg)
+ .addReg(FF1MaskReg)
+ .addImm(2);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(FF1ActiveLanesReg);
+
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32), UpdatedBPermAddrReg)
+ .addReg(FF1MaskX4Reg)
+ .addReg(AMDGPU::M0)
+ .addReg(BPermAddrReg);
- // Manipulate the iterator to get the next active lane
unsigned BITSETOpc =
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
- auto NewActiveBits =
- BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
- .addReg(FF1->getOperand(0).getReg())
- .addReg(ActiveBits->getOperand(0).getReg());
+ BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), UpdatedActiveLanesReg)
+ .addReg(FF1ActiveLanesReg)
+ .addReg(ActiveLanesReg);
- // Add phi nodes
- Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+ PhiActiveLanesInst.addReg(UpdatedActiveLanesReg)
.addMBB(ComputeLoop);
- ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
+ PhiAccInst.addReg(UpdatedAccReg)
+ .addMBB(ComputeLoop);
+ PhiBPermAddrInst.addReg(UpdatedBPermAddrReg)
.addMBB(ComputeLoop);
- // Creating branching
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
- .addReg(NewActiveBits->getOperand(0).getReg())
+ .addReg(UpdatedActiveLanesReg)
.addImm(0);
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
.addMBB(ComputeLoop);
+ // After the loop, broadcast each group's result from its leader lane to
+ // every lane of that group.
+ BuildMI(*ComputeEnd, ComputeEnd->begin(), DL, TII->get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+ .addReg(UpdatedBPermAddrReg)
+ .addReg(UpdatedAccReg)
+ .addImm(0);
+
RetBB = ComputeEnd;
}
MI.eraseFromParent();
return RetBB;
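To make the intent of the new ComputeLoop easier to follow, below is a small host-side C++ model of what the emitted sequence computes for one wave, as I read the BuildMI calls above. The wave size, the unsigned-max flavour, and the assumption that every active lane carries a non-empty mask are simplifications for illustration only:

  #include <algorithm>
  #include <array>
  #include <cstdint>
  #include <iostream>

  constexpr int WaveSize = 32;

  // Scalar model of the lowered masked wave reduction (umax flavour).
  std::array<uint32_t, WaveSize>
  waveReduceMaskUMax(const std::array<uint32_t, WaveSize> &Src,  // per-lane value
                     const std::array<uint32_t, WaveSize> &Mask, // per-lane group bitmask
                     uint32_t Exec) {                            // active-lane mask
    std::array<uint32_t, WaveSize> Acc{};       // accumulator VGPR, identity 0 for umax
    std::array<uint32_t, WaveSize> BPermAddr{}; // byte address later fed to ds_bpermute
    uint32_t Active = Exec;                     // loop induction variable (EXEC copy)
    while (Active) {
      int Lane = __builtin_ctz(Active);               // S_FF1: lowest unprocessed lane
      int Leader = __builtin_ctz(Mask[Lane]);         // S_FF1 of the lane's mask: leader
      Acc[Leader] = std::max(Acc[Leader], Src[Lane]); // readlane + max + writelane
      BPermAddr[Lane] = 4u * Leader;                  // S_LSHL_B32 by 2, this lane's address
      Active &= ~(1u << Lane);                        // S_BITSET0: retire this lane
    }
    // DS_BPERMUTE_B32: every lane reads Acc[BPermAddr / 4], i.e. its group's result.
    std::array<uint32_t, WaveSize> Result{};
    for (int Lane = 0; Lane < WaveSize; ++Lane)
      Result[Lane] = Acc[BPermAddr[Lane] / 4];
    return Result;
  }

  int main() {
    std::array<uint32_t, WaveSize> V{}, M{};
    for (int I = 0; I < WaveSize; ++I) {
      V[I] = I;
      M[I] = I < 16 ? 0x0000FFFFu : 0xFFFF0000u; // two groups of 16 lanes
    }
    auto R = waveReduceMaskUMax(V, M, 0xFFFFFFFFu);
    std::cout << R[0] << " " << R[31] << "\n"; // prints "15 31"
  }

In other words, the loop serialises over the active lanes, keeps one partial result per group in that group's leader lane, and the trailing ds_bpermute fans each leader's value back out to the rest of its group.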
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index de77401eb0137..20192647dfeeb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -298,14 +298,14 @@ def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
- def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
+ def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+ (ins VSrc_b32: $src, VSrc_b32: $mask, VSrc_b32:$strategy),
+ [(set i32:$vdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$mask, i32:$strategy))]> {
}
- def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
- (ins VSrc_b32: $src, VSrc_b32:$strategy),
- [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
+ def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+ (ins VSrc_b32: $src, VSrc_b32: $mask, VSrc_b32:$strategy),
+ [(set i32:$vdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$mask, i32:$strategy))]> {
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index deeceed3a19be..f85b94198c390 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -12,7 +12,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
-declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg)
+declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32, i32 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
@@ -122,12 +122,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1132GISEL-NEXT: s_endpgm
entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
+ %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 15, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
-define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
+define amdgpu_kernel void @const_value(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-LABEL: const_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -218,7 +218,7 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1132GISEL-NEXT: s_endpgm
entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
+ %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 %in, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -256,7 +256,7 @@ define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
; GFX11GISEL: ; %bb.0: ; %entry
; GFX11GISEL-NEXT: s_endpgm
entry:
- %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 1)
+ %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 poison, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -499,7 +499,7 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_endpgm
entry:
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
- %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 1)
+ %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 %in, i32 1)
store i32 %result, ptr addrspace(1) %out
ret void
}
@@ -937,11 +937,11 @@ entry:
br i1 %d_cmp, label %if, label %else
if:
- %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 1)
+ %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 %in, i32 1)
br label %endif
else:
- %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
+ %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 %in, i32 1)
br label %endif
endif: