[llvm] [AMDGPU] DPP implementations for Wave Reduction (PR #185814)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 11 03:02:12 PDT 2026
================
@@ -5915,267 +5945,450 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
}
}
} else {
- // TODO: Implement DPP Strategy and switch based on immediate strategy
- // operand. For now, for all the cases (default, Iterative and DPP we use
- // iterative approach by default.)
-
- // To reduce the VGPR using iterative approach, we need to iterate
- // over all the active lanes. Lowering consists of ComputeLoop,
- // which iterate over only active lanes. We use copy of EXEC register
- // as induction variable and every active lane modifies it using bitset0
- // so that we will get the next active lane for next iteration.
MachineBasicBlock::iterator I = BB.end();
Register SrcReg = MI.getOperand(1).getReg();
bool is32BitOpc = is32bitWaveReduceOperation(Opc);
bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
-
- // Create Control flow for loop
- // Split MI's Machine Basic block into For loop
- auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
-
// Create virtual registers required for lowering.
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
- Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
- Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
- Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
-
+ const TargetRegisterClass *SrcRegClass = MRI.getRegClass(SrcReg);
bool IsWave32 = ST.isWave32();
unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-
- // Create initial values of induction variable from Exec, Accumulator and
- // insert branch instr to newly created ComputeBlock
- BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
- if (is32BitOpc) {
- uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
- .addImm(IdentityValue);
- } else {
- uint64_t IdentityValue =
- MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
- ? 0x0 // +0.0 for double sub reduction
- : getIdentityValueFor64BitWaveReduction(Opc);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
- .addImm(IdentityValue);
- }
- // clang-format off
+    if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
+        !ST.hasDPP()) { // If the target doesn't support DPP operations,
+                        // default to the iterative strategy.
+
+      // To reduce the VGPR using the iterative approach, we need to iterate
+      // over all the active lanes. The lowering consists of a ComputeLoop
+      // that visits only the active lanes. We use a copy of EXEC as the
+      // induction variable, and each iteration clears the processed lane's
+      // bit with bitset0 so the next iteration finds the next active lane.
+
+      // Create the control flow for the loop by splitting MI's machine
+      // basic block into a loop structure.
+ auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+ Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
+ Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+ Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
+
+      // Initialize the induction variable from EXEC and the accumulator with
+      // the identity value, then branch to the newly created ComputeLoop.
+ BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+ if (is32BitOpc) {
+ uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+ .addImm(IdentityValue);
+ } else {
+ uint64_t IdentityValue =
+ MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
+ ? 0x0 // +0.0 for double sub reduction
+ : getIdentityValueFor64BitWaveReduction(Opc);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO),
+ IdentityValReg)
+ .addImm(IdentityValue);
+ }
+ // clang-format off
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(ComputeLoop);
- // clang-format on
-
- // Start constructing ComputeLoop
- I = ComputeLoop->begin();
- auto Accumulator =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
- .addReg(IdentityValReg)
- .addMBB(&BB);
- auto ActiveBits =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
- .addReg(LoopIterator)
- .addMBB(&BB);
-
- I = ComputeLoop->end();
- MachineInstr *NewAccumulator;
- // Perform the computations
- unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
- BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
- .addReg(ActiveBitsReg);
- if (is32BitOpc) {
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
- LaneValueReg)
- .addReg(SrcReg)
- .addReg(FF1Reg);
- if (isFPOp) {
- Register LaneValVreg =
- MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
- Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
- // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
- LaneValVreg)
- .addReg(LaneValueReg);
- BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
- .addImm(0) // src0 modifier
- .addReg(Accumulator->getOperand(0).getReg())
- .addImm(0) // src1 modifier
- .addReg(LaneValVreg)
- .addImm(0) // clamp
- .addImm(0); // omod
- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
- TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
- .addReg(DstVreg);
+ // clang-format on
+
+ // Start constructing ComputeLoop
+ I = ComputeLoop->begin();
+ auto Accumulator =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+ .addReg(IdentityValReg)
+ .addMBB(&BB);
+ auto ActiveBits =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+ .addReg(LoopIterator)
+ .addMBB(&BB);
+
+ I = ComputeLoop->end();
+ MachineInstr *NewAccumulator;
+ // Perform the computations
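+      // S_FF1 returns the index of the lowest set bit in the active-lane
+      // mask, i.e. the next active lane to read from.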
+ unsigned SFFOpc =
+ IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+ .addReg(ActiveBitsReg);
+ if (is32BitOpc) {
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueReg)
+ .addReg(SrcReg)
+ .addReg(FF1Reg);
+ if (isFPOp) {
+ Register LaneValVreg =
+ MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
+ Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
+          // Move the lane value into a VGPR to avoid the constant-bus
+          // restriction.
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+ LaneValVreg)
+ .addReg(LaneValueReg);
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
+ .addImm(0) // src0 modifier
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addImm(0) // src1 modifier
+ .addReg(LaneValVreg)
+ .addImm(0) // clamp
+ .addImm(0); // omod
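+          // The VALU result is uniform across the active lanes, so
+          // readfirstlane can move it into the scalar destination.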
+ NewAccumulator =
+ BuildMI(*ComputeLoop, I, DL,
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(DstVreg);
+ } else {
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValueReg);
+ }
} else {
- NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addReg(LaneValueReg);
- }
- } else {
- Register LaneValueLoReg =
- MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- Register LaneValueHiReg =
- MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
- const TargetRegisterClass *SrcSubRC =
- TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
- MachineOperand Op1L = TII->buildExtractSubRegOrImm(
- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
- MachineOperand Op1H = TII->buildExtractSubRegOrImm(
- MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
- // lane value input should be in an sgpr
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
- LaneValueLoReg)
- .add(Op1L)
- .addReg(FF1Reg);
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
- LaneValueHiReg)
- .add(Op1H)
- .addReg(FF1Reg);
- auto LaneValue = BuildMI(*ComputeLoop, I, DL,
- TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
- .addReg(LaneValueLoReg)
- .addImm(AMDGPU::sub0)
- .addReg(LaneValueHiReg)
- .addImm(AMDGPU::sub1);
- switch (Opc) {
- case AMDGPU::S_OR_B64:
- case AMDGPU::S_AND_B64:
- case AMDGPU::S_XOR_B64: {
- NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addReg(LaneValue->getOperand(0).getReg())
- .setOperandDead(3); // Dead scc
- break;
- }
- case AMDGPU::V_CMP_GT_I64_e64:
- case AMDGPU::V_CMP_GT_U64_e64:
- case AMDGPU::V_CMP_LT_I64_e64:
- case AMDGPU::V_CMP_LT_U64_e64: {
- Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
- Register ComparisonResultReg =
- MRI.createVirtualRegister(WaveMaskRegClass);
- int SrcIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
- const TargetRegisterClass *VregClass =
- TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
- const TargetRegisterClass *VSubRegClass =
- TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
- Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
- MachineOperand SrcReg0Sub0 =
- TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
- VregClass, AMDGPU::sub0, VSubRegClass);
- MachineOperand SrcReg0Sub1 =
- TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
- VregClass, AMDGPU::sub1, VSubRegClass);
- BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
- AccumulatorVReg)
- .add(SrcReg0Sub0)
- .addImm(AMDGPU::sub0)
- .add(SrcReg0Sub1)
- .addImm(AMDGPU::sub1);
- BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
- .addReg(LaneValue->getOperand(0).getReg())
- .addReg(AccumulatorVReg);
-
- unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
- BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
- .addReg(LaneMaskReg)
- .addReg(ActiveBitsReg);
-
- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
- TII->get(AMDGPU::S_CSELECT_B64), DstReg)
- .addReg(LaneValue->getOperand(0).getReg())
- .addReg(Accumulator->getOperand(0).getReg());
- break;
- }
- case AMDGPU::V_MIN_F64_e64:
- case AMDGPU::V_MIN_NUM_F64_e64:
- case AMDGPU::V_MAX_F64_e64:
- case AMDGPU::V_MAX_NUM_F64_e64:
- case AMDGPU::V_ADD_F64_e64:
- case AMDGPU::V_ADD_F64_pseudo_e64: {
- int SrcIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
- const TargetRegisterClass *VregRC =
- TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
- const TargetRegisterClass *VregSubRC =
- TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
- Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
- Register DstVreg = MRI.createVirtualRegister(VregRC);
- Register LaneValLo =
+ Register LaneValueLoReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- Register LaneValHi =
+ Register LaneValueHiReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
- .addReg(Accumulator->getOperand(0).getReg());
- unsigned Modifier =
- MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
- ? SISrcMods::NEG
- : SISrcMods::NONE;
- auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
- .addImm(Modifier) // src0 modifiers
+ Register LaneValReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *SrcSubRC =
+ TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+ MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
+      // The lane value input must be in an SGPR; read the low and high
+      // halves of the 64-bit source separately.
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueLoReg)
+ .add(Op1L)
+ .addReg(FF1Reg);
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueHiReg)
+ .add(Op1H)
+ .addReg(FF1Reg);
+ auto LaneValue =
+ BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+ LaneValReg)
+ .addReg(LaneValueLoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(LaneValueHiReg)
+ .addImm(AMDGPU::sub1);
+ switch (Opc) {
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_XOR_B64: {
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
.addReg(LaneValue->getOperand(0).getReg())
- .addImm(SISrcMods::NONE) // src1 modifiers
- .addReg(AccumulatorVReg)
- .addImm(SISrcMods::NONE) // clamp
- .addImm(SISrcMods::NONE); // omod
- auto ReadLaneLo =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- LaneValLo);
- auto ReadLaneHi =
- BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- LaneValHi);
- MachineBasicBlock::iterator Iters = *ReadLaneLo;
- MachineOperand Op1L =
- TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
- VregRC, AMDGPU::sub0, VregSubRC);
- MachineOperand Op1H =
- TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
- VregRC, AMDGPU::sub1, VregSubRC);
- ReadLaneLo.add(Op1L);
- ReadLaneHi.add(Op1H);
- NewAccumulator = BuildMI(*ComputeLoop, I, DL,
- TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
- .addReg(LaneValLo)
- .addImm(AMDGPU::sub0)
- .addReg(LaneValHi)
- .addImm(AMDGPU::sub1);
- break;
+ .setOperandDead(3); // Dead scc
+ break;
+ }
+ case AMDGPU::V_CMP_GT_I64_e64:
+ case AMDGPU::V_CMP_GT_U64_e64:
+ case AMDGPU::V_CMP_LT_I64_e64:
+ case AMDGPU::V_CMP_LT_U64_e64: {
+ Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register ComparisonResultReg =
+ MRI.createVirtualRegister(WaveMaskRegClass);
+ int SrcIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
+ const TargetRegisterClass *VregClass =
+ TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
+ const TargetRegisterClass *VSubRegClass =
+ TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
+ Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
+ MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Accumulator->getOperand(0), VregClass, AMDGPU::sub0,
+ VSubRegClass);
+ MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Accumulator->getOperand(0), VregClass, AMDGPU::sub1,
+ VSubRegClass);
+ BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
+ AccumulatorVReg)
+ .add(SrcReg0Sub0)
+ .addImm(AMDGPU::sub0)
+ .add(SrcReg0Sub1)
+ .addImm(AMDGPU::sub1);
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
+ .addReg(LaneValue->getOperand(0).getReg())
+ .addReg(AccumulatorVReg);
+
+ unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
+ .addReg(LaneMaskReg)
+ .addReg(ActiveBitsReg);
+
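+        // S_CSELECT reads SCC, which the S_AND above set if the comparison
+        // held in a still-active lane; take the new lane value in that case,
+        // otherwise keep the current accumulator.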
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+ TII->get(AMDGPU::S_CSELECT_B64), DstReg)
+ .addReg(LaneValue->getOperand(0).getReg())
+ .addReg(Accumulator->getOperand(0).getReg());
+ break;
+ }
+ case AMDGPU::V_MIN_F64_e64:
+ case AMDGPU::V_MIN_NUM_F64_e64:
+ case AMDGPU::V_MAX_F64_e64:
+ case AMDGPU::V_MAX_NUM_F64_e64:
+ case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_ADD_F64_pseudo_e64: {
+ int SrcIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
+ const TargetRegisterClass *VregRC =
+ TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
+ const TargetRegisterClass *VregSubRC =
+ TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
+ Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
+ Register DstVreg = MRI.createVirtualRegister(VregRC);
+ Register LaneValLo =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register LaneValHi =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
+ .addReg(Accumulator->getOperand(0).getReg());
+ unsigned Modifier =
+ MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
+ ? SISrcMods::NEG
+ : SISrcMods::NONE;
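+          // FSUB reductions reuse the FADD opcode and negate the lane value
+          // through the src0 NEG modifier: acc + (-lane) == acc - lane.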
+ auto DstVregInst =
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
+ .addImm(Modifier) // src0 modifiers
+ .addReg(LaneValue->getOperand(0).getReg())
+ .addImm(SISrcMods::NONE) // src1 modifiers
+ .addReg(AccumulatorVReg)
+ .addImm(SISrcMods::NONE) // clamp
+ .addImm(SISrcMods::NONE); // omod
+ auto ReadLaneLo =
+ BuildMI(*ComputeLoop, I, DL,
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
+ auto ReadLaneHi =
+ BuildMI(*ComputeLoop, I, DL,
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
+ MachineBasicBlock::iterator Iters = *ReadLaneLo;
+ MachineOperand Op1L = TII->buildExtractSubRegOrImm(
+ Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub0,
+ VregSubRC);
+ MachineOperand Op1H = TII->buildExtractSubRegOrImm(
+ Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub1,
+ VregSubRC);
+ ReadLaneLo.add(Op1L);
+ ReadLaneHi.add(Op1H);
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL,
+ TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
+ .addReg(LaneValLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(LaneValHi)
+ .addImm(AMDGPU::sub1);
+ break;
+ }
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO: {
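+        // The 64-bit add/sub pseudo is expanded immediately below into its
+        // 32-bit halves (a low op plus a carry into the high half).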
+ NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValue->getOperand(0).getReg());
+ ComputeLoop =
+ Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
+ break;
+ }
+ }
}
- case AMDGPU::S_ADD_U64_PSEUDO:
- case AMDGPU::S_SUB_U64_PSEUDO: {
- NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addReg(LaneValue->getOperand(0).getReg());
- ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
- break;
+      // Clear the bit of the lane just processed so the next S_FF1 finds
+      // the next active lane.
+ unsigned BITSETOpc =
+ IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+ .addReg(FF1Reg)
+ .addReg(ActiveBitsReg);
+
+      // Complete the PHI nodes with the loop-carried values.
+ Accumulator.addReg(DstReg).addMBB(ComputeLoop);
+ ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
+
+      // Create the loop back-edge branch.
+ unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+ .addReg(NewActiveBitsReg)
+ .addImm(0);
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ .addMBB(ComputeLoop);
+
+ RetBB = ComputeEnd;
+ } else {
+      assert(ST.hasDPP() && "Subtarget must support DPP operations");
+
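+      // DPP strategy: combine values with a short, fixed sequence of DPP row
+      // operations instead of a loop: row_shr:1/2/4/8 reduce within each
+      // 16-lane row, then row_bcast:15 and row_bcast:31 combine the per-row
+      // partial results.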
+ Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass);
+ Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass);
+ Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass);
+ Register DPPRowShr1 = MRI.createVirtualRegister(SrcRegClass);
+ Register DPPRowShr2 = MRI.createVirtualRegister(SrcRegClass);
+ Register DPPRowShr4 = MRI.createVirtualRegister(SrcRegClass);
+ Register DPPRowShr8 = MRI.createVirtualRegister(SrcRegClass);
+ Register RowBcast15 = MRI.createVirtualRegister(SrcRegClass);
+ Register ReducedValSGPR = MRI.createVirtualRegister(DstRegClass);
+ Register NegatedReducedVal = MRI.createVirtualRegister(DstRegClass);
+ Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass);
+ Register FinalDPPResult;
+ Register UndefExec = MRI.createVirtualRegister(
+ IsWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
+ : &AMDGPU::SReg_64_XEXECRegClass);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
+
+ uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), IdentitySGPR)
+ .addImm(IdentityValue);
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
+ .addReg(IdentitySGPR);
+
+      // Set inactive lanes to the identity value so that values the DPP
+      // steps shift in from them cannot perturb the reduction.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SET_INACTIVE_B32), SrcWithIdentity)
+ .addImm(0) // src0 modifiers
+ .addReg(SrcReg) // src0
+ .addImm(0) // src1 modifiers
+ .addReg(IdentityVGPR) // identity value for inactive lanes
+ .addReg(UndefExec); // bool i1
+
+ unsigned DPPOpc = getDPPOpcForWaveReduction(Opc, ST);
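+      // DPP controls only exist on VALU instructions, so map the scalar
+      // reduction opcodes onto their VALU equivalents.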
+ auto GetVALUOpc = [](unsigned Opc) -> unsigned {
+ switch (Opc) {
+ case AMDGPU::S_MIN_U32:
+ return AMDGPU::V_MIN_U32_e64;
+ case AMDGPU::S_MIN_I32:
+ return AMDGPU::V_MIN_I32_e64;
+ case AMDGPU::S_MAX_U32:
+ return AMDGPU::V_MAX_U32_e64;
+ case AMDGPU::S_MAX_I32:
+ return AMDGPU::V_MAX_I32_e64;
+ case AMDGPU::S_ADD_I32:
+ return AMDGPU::V_ADD_I32_e64;
+ case AMDGPU::S_SUB_I32:
+ return AMDGPU::V_SUB_I32_e64;
+ case AMDGPU::S_AND_B32:
+ return AMDGPU::V_AND_B32_e64;
+ case AMDGPU::S_OR_B32:
+ return AMDGPU::V_OR_B32_e64;
+ case AMDGPU::S_XOR_B32:
+ return AMDGPU::V_XOR_B32_e64;
+ default:
+ return Opc;
+ }
+ };
+ auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
+ unsigned DPPCtrl, unsigned RowMask,
+ unsigned BankMask, unsigned BoundCtrl) {
+ BuildMI(BB, MI, DL, TII->get(DPPOpc), Dst)
+ .addReg(Src) // old
+ .addReg(Src) // src0
+ .addReg(Src) // src1
+ .addImm(DPPCtrl) // dpp-ctrl
+ .addImm(RowMask) // row-mask
+ .addImm(BankMask) // bank-mask
+ .addImm(BoundCtrl); // bound-control
----------------
easyonaadit wrote:
I kept it that way for flexibility. But you're right, these values won't be changing for reduction operations.
https://github.com/llvm/llvm-project/pull/185814