[llvm] AMDGPU/GlobalISel: Fix inst-selection of ballot (PR #109986)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 30 05:07:50 PDT 2024
================
@@ -1429,34 +1429,129 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
std::optional<ValueAndVReg> Arg =
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
- const auto BuildCopy = [&](Register SrcReg) {
- if (Size == STI.getWavefrontSize()) {
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
- .addReg(SrcReg);
- return;
+ const auto getCmpInput = [&]() -> MachineInstr * {
+ MachineInstr *SrcMI = getDefIgnoringCopies(I.getOperand(2).getReg(), *MRI);
+ // Try to fold sgpr compare.
+ if (SrcMI->getOpcode() == AMDGPU::G_TRUNC)
+ SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg());
+
+ if (SrcMI->getOpcode() == AMDGPU::G_ICMP ||
+ SrcMI->getOpcode() == AMDGPU::G_FCMP)
+ return SrcMI;
+ return nullptr;
+ };
+
+ const auto FoldCmp = [&](Register Dst, MachineInstr *CmpMI) -> bool {
+ // Fold ballot of a compare. Active lanes when the ballot is executed need
+ // to also be active when the compare is executed for this fold to be
+ // correct. If an inactive lane on compare becomes active for the ballot,
+ // divergent control flow is involved. The compare was in a divergent branch
+ // and needs to go through phi before being used by the ballot. The ballot
+ // is in a block that merged control flow. Using the compare directly in the
+ // ballot implies that active lanes for the ballot are a subset of active
+ // lanes for the compare.
+ auto Pred = (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate();
+ Register Op0 = CmpMI->getOperand(2).getReg();
+ Register Op1 = CmpMI->getOperand(3).getReg();
+ unsigned OpSize = MRI->getType(Op0).getSizeInBits();
+ const TargetRegisterClass *VgprRC = TRI.getVGPRClassForBitWidth(OpSize);
+
+ int CmpOpcode = getV_CMPOpcode(Pred, OpSize, *Subtarget);
+ if (CmpOpcode == -1)
+ return false;
+
+ MachineInstr *Cmp;
+ unsigned Op0Idx, Op1Idx;
+ if (CmpMI->getOpcode() == AMDGPU::G_ICMP) {
+ Cmp =
+ BuildMI(*BB, &I, DL, TII.get(CmpOpcode), Dst).addReg(Op0).addReg(Op1);
+ Op0Idx = 1;
+ Op1Idx = 2;
+ } else {
+ // fcmp compares have modifiers
+ Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpcode), Dst)
+ .addImm(0)
+ .addReg(Op0)
+ .addImm(0)
+ .addReg(Op1)
+ .addImm(0);
+ Op0Idx = 2;
+ Op1Idx = 4;
}
- // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
- Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(SrcReg)
+ return constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, *Cmp, *VgprRC,
+ Cmp->getOperand(Op0Idx)) &&
+ constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, *Cmp, *VgprRC,
+ Cmp->getOperand(Op1Idx)) &&
+ constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
----------------
arsenm wrote:
The constrainSelectedInstRegOperands call is redundant with the two constrainOperandRegClass calls. Either constrain the result register directly, or just use the single constrainSelectedInstRegOperands call.
https://github.com/llvm/llvm-project/pull/109986
More information about the llvm-commits mailing list