[llvm] reduce over divergent mask (PR #133228)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 27 03:27:00 PDT 2025
github-actions[bot] wrote:
<!--LLVM CODE FORMAT COMMENT: {clang-format}-->
:warning: The C/C++ code formatter, clang-format, found issues in your code. :warning:
<details>
<summary>
You can test this locally with the following command:
</summary>
``````````bash
git-clang-format --diff 78408fddccf34b7d79eb655fa2cb4dfacdfb8ae3 c5c1cc54524d839f148c1390b659772fcabc0a4a --extensions cpp -- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
``````````
</details>
<details>
<summary>
View the diff from clang-format here.
</summary>
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8bca356327..2578619ab3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4980,11 +4980,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
Register DstReg = MI.getOperand(0).getReg();
llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DstReg)) << "\n";
Register MaskReg = MI.getOperand(2).getReg();
- llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
+ llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(MaskReg))
+ << "\n";
- // llvm::errs() << "srcreg:" << MRI.getRegClassName(MRI.getRegClass(SrcReg)) << "\n";
- // llvm::errs() << "DstReg:" << MRI.getRegClassName(MRI.getRegClass(DstReg)) << "\n";
- // llvm::errs() << "MaskReg:" << MRI.getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
+ // llvm::errs() << "srcreg:" << MRI.getRegClassName(MRI.getRegClass(SrcReg))
+ // << "\n"; llvm::errs() << "DstReg:" <<
+ // MRI.getRegClassName(MRI.getRegClass(DstReg)) << "\n"; llvm::errs() <<
+ // "MaskReg:" << MRI.getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
MachineBasicBlock *RetBB = nullptr;
if (isSGPR) {
// These operations with a uniform value i.e. SGPR are idempotent.
@@ -5015,9 +5017,11 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
- Register InitalValReg = MRI.createVirtualRegister(DstRegClass);//MRI.getRegClass(SrcReg)
+ Register InitalValReg =
+ MRI.createVirtualRegister(DstRegClass); // MRI.getRegClass(SrcReg)
- Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);//MRI.getRegClass(SrcReg)
+ Register AccumulatorReg =
+ MRI.createVirtualRegister(DstRegClass); // MRI.getRegClass(SrcReg)
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
Register TempRegMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5037,12 +5041,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// insert branch instr to newly created ComputeBlockk
uint32_t InitalValue =
(Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
- auto TmpSReg =
- BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); //s_mov_b64 s[2:3], exec
+ auto TmpSReg = BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator)
+ .addReg(ExecReg); // s_mov_b64 s[2:3], exec
// auto TmpMaskSReg =
- // BuildMI(BB, I, DL, TII->get(MovOpc), TempRegMaskReg).addReg(MaskReg); //s_mov_b64 s[2:3], exec
+ // BuildMI(BB, I, DL, TII->get(MovOpc), TempRegMaskReg).addReg(MaskReg);
+ // //s_mov_b64 s[2:3], exec
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
- .addImm(InitalValue);//s_mov_b32 s4, 0 | %17:sgpr_32 = S_MOV_B32 0
+ .addImm(InitalValue); // s_mov_b32 s4, 0 | %17:sgpr_32 = S_MOV_B32 0
// clang-format off
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(ComputeLoop);
@@ -5061,21 +5066,36 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
// Perform the computations
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
- auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
- .addReg(ActiveBits->getOperand(0).getReg());//%index.sgpr = S_FF1_I32_B64 %exec_copy.sreg
- auto LaneValue = BuildMI(*ComputeLoop, I, DL,
- TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
- .addReg(SrcReg)
- .addReg(FF1->getOperand(0).getReg());//%value_at_lane_index.sreg = V_READLANE %value.vgpr %index.sgpr
- auto MaskLaneValue = BuildMI(*ComputeLoop, I, DL,
- TII->get(AMDGPU::V_READLANE_B32), MaskLaneValueReg)
- .addReg(MaskReg)
- .addReg(FF1->getOperand(0).getReg());//%mask_at_lane_index.sreg = V_READLANE %mask.vgpr %index.sgpr
- auto FF2 = BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B64), FF1Reg)
- .addReg(MaskLaneValue->getOperand(0).getReg());//%subgroupindex.sgpr = S_FF1_I32_B64 %mask_at_lane_index.sreg
- auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
- .addReg(Accumulator->getOperand(0).getReg())
- .addReg(LaneValue->getOperand(0).getReg());//%acc.sgpr = max %acc.sgpr %value_at_lane_index.sreg
+ auto FF1 =
+ BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+ .addReg(
+ ActiveBits->getOperand(0)
+ .getReg()); //%index.sgpr = S_FF1_I32_B64 %exec_copy.sreg
+ auto LaneValue =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ LaneValueReg)
+ .addReg(SrcReg)
+ .addReg(FF1->getOperand(0)
+ .getReg()); //%value_at_lane_index.sreg = V_READLANE
+ //%value.vgpr %index.sgpr
+ auto MaskLaneValue =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+ MaskLaneValueReg)
+ .addReg(MaskReg)
+ .addReg(FF1->getOperand(0)
+ .getReg()); //%mask_at_lane_index.sreg = V_READLANE
+ //%mask.vgpr %index.sgpr
+ auto FF2 =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B64), FF1Reg)
+ .addReg(MaskLaneValue->getOperand(0)
+ .getReg()); //%subgroupindex.sgpr = S_FF1_I32_B64
+ //%mask_at_lane_index.sreg
+ auto NewAccumulator =
+ BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(
+ LaneValue->getOperand(0).getReg()); //%acc.sgpr = max %acc.sgpr
+ //%value_at_lane_index.sreg
// Manipulate the iterator to get the next active lane
unsigned BITSETOpc =
@@ -5083,7 +5103,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
auto NewActiveBits =
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
.addReg(FF1->getOperand(0).getReg())
- .addReg(ActiveBits->getOperand(0).getReg());//%bitsetresult = S_BITSET0_B64 %exec_copy
+ .addReg(ActiveBits->getOperand(0)
+ .getReg()); //%bitsetresult = S_BITSET0_B64 %exec_copy
// Add phi nodes
Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
``````````
</details>
https://github.com/llvm/llvm-project/pull/133228
More information about the llvm-commits mailing list