[llvm] [AMDGPU] DPP implementations for Wave Reduction (PR #185814)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 11 02:54:01 PDT 2026
================
@@ -5915,267 +5945,450 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
}
}
} else {
- // TODO: Implement DPP Strategy and switch based on immediate strategy
- // operand. For now, for all the cases (default, Iterative and DPP we use
- // iterative approach by default.)
-
- // To reduce the VGPR using iterative approach, we need to iterate
- // over all the active lanes. Lowering consists of ComputeLoop,
- // which iterate over only active lanes. We use copy of EXEC register
- // as induction variable and every active lane modifies it using bitset0
- // so that we will get the next active lane for next iteration.
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
bool is32BitOpc = is32bitWaveReduceOperation(Opc);
bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
-
- // Create Control flow for loop
- // Split MI's Machine Basic block into For loop
- auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
-
// Create virtual registers required for lowering.
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
- Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
- Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
- Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
-
+ const TargetRegisterClass *SrcRegClass = MRI.getRegClass(SrcReg);
bool IsWave32 = ST.isWave32();
unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-
- // Create initial values of induction variable from Exec, Accumulator and
- // insert branch instr to newly created ComputeBlock
- BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
- if (is32BitOpc) {
- uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
- .addImm(IdentityValue);
- } else {
- uint64_t IdentityValue =
- MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
- ? 0x0 // +0.0 for double sub reduction
- : getIdentityValueFor64BitWaveReduction(Opc);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
- .addImm(IdentityValue);
- }
- // clang-format off
+ if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
+ !ST.hasDPP()) { // If target doesn't support DPP operations, default to
+ // iterative stratergy
+
+ // To reduce the VGPR using iterative approach, we need to iterate
+ // over all the active lanes. Lowering consists of ComputeLoop,
+ // which iterate over only active lanes. We use copy of EXEC register
+ // as induction variable and every active lane modifies it using bitset0
+ // so that we will get the next active lane for next iteration.
+
+ // Create Control flow for loop
+ // Split MI's Machine Basic block into For loop
+ auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+ Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
+ Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+ Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
+
+ // Create initial values of induction variable from Exec, Accumulator and
+ // insert branch instr to newly created ComputeBlock
+ BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+ if (is32BitOpc) {
+ uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+ .addImm(IdentityValue);
+ } else {
+ uint64_t IdentityValue =
+ MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
+ ? 0x0 // +0.0 for double sub reduction
+ : getIdentityValueFor64BitWaveReduction(Opc);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO),
+ IdentityValReg)
+ .addImm(IdentityValue);
+ }
+ // clang-format off
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(ComputeLoop);
- // clang-format on
-
- // Start constructing ComputeLoop
- I = ComputeLoop->begin();
- auto Accumulator =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
- .addReg(IdentityValReg)
- .addMBB(&BB);
- auto ActiveBits =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
- .addReg(LoopIterator)
- .addMBB(&BB);
-
- I = ComputeLoop->end();
- MachineInstr *NewAccumulator;
- // Perform the computations
- unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
- BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
- .addReg(ActiveBitsReg);
- if (is32BitOpc) {
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
- LaneValueReg)
- .addReg(SrcReg)
- .addReg(FF1Reg);
- if (isFPOp) {
- Register LaneValVreg =
- MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
- Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
- // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
- LaneValVreg)
- .addReg(LaneValueReg);
- BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
- .addImm(0) // src0 modifier
- .addReg(Accumulator->getOperand(0).getReg())
- .addImm(0) // src1 modifier
- .addReg(LaneValVreg)
- .addImm(0) // clamp
- .addImm(0); // omod
- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
- TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
- .addReg(DstVreg);
+ // clang-format on
----------------
easyonaadit wrote:
It was added while the whole file was being formatted:
https://github.com/llvm/llvm-project/commit/c3fe0e46e2188fc94a64b51166d8b7e7694ed8c8#diff-e78d2fbd64648d787707fd3d4e7e5b5d2f00fb9c09972937718a11237933c597R4967
I'm not sure about the reasoning behind it, though. I'll remove it. cc @shiltian
https://github.com/llvm/llvm-project/pull/185814
More information about the llvm-commits mailing list