[llvm] AMDGPU/GlobalISel: Fix inst-selection of ballot (PR #109986)

Fri Oct 4 06:17:18 PDT 2024

================
@@ -1429,34 +1432,131 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
   std::optional<ValueAndVReg> Arg =
       getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
 
-  const auto BuildCopy = [&](Register SrcReg) {
-    if (Size == STI.getWavefrontSize()) {
-      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
-          .addReg(SrcReg);
-      return;
+  const auto getCmpInput = [&]() -> MachineInstr * {
+    MachineInstr *SrcMI = getDefIgnoringCopies(I.getOperand(2).getReg(), *MRI);
+    // Try to fold sgpr compare.
+    if (SrcMI->getOpcode() == AMDGPU::G_TRUNC)
+      SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg());
+
+    if (SrcMI->getOpcode() == AMDGPU::G_ICMP ||
+        SrcMI->getOpcode() == AMDGPU::G_FCMP)
+      return SrcMI;
+    return nullptr;
+  };
+
+  const auto FoldCmp = [&](Register Dst, MachineInstr *CmpMI) -> bool {
+    // Fold ballot of a compare. Active lanes when the ballot is executed need
+    // to also be active when the compare is executed for this fold to be
+    // correct. If an inactive lane on compare becomes active for the ballot,
+    // divergent control flow is involved. The compare is in a divergent branch
+    // and needs to go through phi before being used by the ballot, the ballot
+    // is in a block that merged control flow. Using the compare directly in the
+    // ballot implies that active lanes for the ballot are a subset of active
+    // lanes for the compare.
+    auto Pred = cast<GAnyCmp>(CmpMI)->getCond();
+    Register Src0Reg = CmpMI->getOperand(2).getReg();
+    Register Src1Reg = CmpMI->getOperand(3).getReg();
+    unsigned OpSize = MRI->getType(Src0Reg).getSizeInBits();
+
+    int CmpOpcode = getV_CMPOpcode(Pred, OpSize, *Subtarget);
+    if (CmpOpcode == -1)
+      return false;
+
+    const auto constrainToVGPR = [&](Register Reg,
+                                     MachineInstr *InsertPt) -> Register {
+      if (RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID)
+        return Reg;
+      Register VgprReg =
+          MRI->createVirtualRegister(TRI.getVGPRClassForBitWidth(OpSize));
+      BuildMI(*BB, InsertPt, DL, TII.get(AMDGPU::COPY), VgprReg).addReg(Reg);
+      return VgprReg;
+    };
+
+    MachineInstrBuilder Cmp;
+    if (CmpMI->getOpcode() == AMDGPU::G_ICMP) {
+      Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpcode), Dst);
+      Cmp.addReg(constrainToVGPR(Src0Reg, Cmp))
+          .addReg(constrainToVGPR(Src1Reg, Cmp));
+    } else {
+      auto [Src0, Src0Mods] = selectVOP3ModsImpl(Src0Reg);
+      auto [Src1, Src1Mods] = selectVOP3ModsImpl(Src1Reg);
----------------
arsenm wrote:

Do we need tests with source modifiers?

https://github.com/llvm/llvm-project/pull/109986