[llvm] [AMDGPU] Extending wave reduction intrinsics for `i64` types - 1 (PR #150169)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 19 08:28:46 PDT 2025
================
@@ -5222,73 +5237,164 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
-
+ Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
-
- Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
- Register LaneValueReg =
- MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
bool IsWave32 = ST.isWave32();
- unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
// Create initial values of induction variable from Exec, Accumulator and
// insert branch instr to newly created ComputeBlock
- uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
- auto TmpSReg =
- BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
- BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
- .addImm(InitalValue);
+ uint32_t IdentityValue = getIdentityValueForWaveReduction(Opc);
+ BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
+ if (is32BitOpc) {
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
+ .addImm(IdentityValue);
+ } else {
+ Register Identitylo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Identityhi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identityhi)
+ .addImm(IdentityValue);
+ switch (Opc) {
+ case AMDGPU::V_CMP_LT_U64_e64:
+ case AMDGPU::V_CMP_LT_I64_e64:
+ IdentityValue = int32_t(-1); // u|min
+ break;
+ case AMDGPU::V_CMP_GT_U64_e64:
+ case AMDGPU::V_CMP_GT_I64_e64:
+ IdentityValue = int32_t(0); // u|max
+ break;
+ }
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), Identitylo)
+ .addImm(IdentityValue);
+ BuildMI(BB, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), IdentityValReg)
+ .addReg(Identitylo)
+ .addImm(AMDGPU::sub0)
+ .addReg(Identityhi)
+ .addImm(AMDGPU::sub1);
----------------
arsenm wrote:
That's why you need to use S_MOV_B64_IMM_PSEUDO if the value is not an inline immediate.
https://github.com/llvm/llvm-project/pull/150169
More information about the llvm-commits mailing list