[llvm] [amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic (PR #105822)

Tue Aug 27 11:56:44 PDT 2024

================
@@ -15671,6 +15671,133 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
   }
 }
 
+static void removeInitWholeWaveBranch(MachineFunction &MF,
+                                      MachineRegisterInfo &MRI,
+                                      const SIInstrInfo *TII) {
+  // Remove SI_INIT_WHOLE_WAVE and the following SI_IF/END_CF and instead set
+  // EXEC to -1 at SI_END_CF.
+  auto IWWIt = find_if(MF.begin()->instrs(), [](const MachineInstr &MI) {
+    return MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE;
+  });
+  if (IWWIt == MF.begin()->instr_end())
+    return; // We've been here before (GISel runs finalizeLowering twice).
+
+  MachineInstr &If = *MRI.use_begin(IWWIt->getOperand(0).getReg())->getParent();
+  assert(If.getOpcode() == AMDGPU::SI_IF &&
+         "Unexpected user for init.whole.wave result");
+  assert(MRI.hasOneUse(IWWIt->getOperand(0).getReg()) &&
+         "Expected simple control flow");
+
+  MachineInstr &EndCf = *MRI.use_begin(If.getOperand(0).getReg())->getParent();
+  MachineBasicBlock *EndBB = EndCf.getParent();
+
+  // Update all the Phis: since we're removing a predecessor, we need to remove
+  // the corresponding pair of operands. However, we can't just drop the value
+  // coming from the 'if' block - that's going to be the value of the inactive
+  // lanes.
+  // %v = phi (%inactive, %if), (%active1, %shader1), ... (%activeN, %shaderN)
+  // should become
+  // %t = phi (%active1, %shader1), ... (%activeN, %shaderN)
+  // %v = v_set_inactive %t, %inactive
+  // Note that usually EndCf will be the first instruction after the phis and as
+  // such will serve as the end of the range when iterating over phis.
+  // Therefore, we shouldn't introduce any new instructions before it.
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  auto AfterEndCf = std::next(EndCf.getIterator());
+  for (auto &Phi : EndBB->phis()) {
+    Register PhiDest = Phi.getOperand(0).getReg();
+    const TargetRegisterClass *PhiRC = MRI.getRegClass(PhiDest);
+
+    Register NewPhiDest = MRI.createVirtualRegister(PhiRC);
+    Phi.getOperand(0).setReg(NewPhiDest);
+
+    unsigned InactiveOpIdx = 0;
+    for (unsigned I = 1; I < Phi.getNumOperands(); I += 2) {
+      if (Phi.getOperand(I + 1).getMBB() == If.getParent()) {
+        InactiveOpIdx = I;
+        break;
+      }
+    }
+    assert(InactiveOpIdx != 0 && "Broken phi?");
+
+    // At this point, the register class could be larger than 32 or 64, so we
+    // might have to use more than one V_SET_INACTIVE instruction.
+    unsigned Size = TRI.getRegSizeInBits(*PhiRC);
+    switch (Size) {
+    case 32:
+      BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+              TII->get(AMDGPU::V_SET_INACTIVE_B32), PhiDest)
+          .addReg(NewPhiDest)
+          .add(Phi.getOperand(InactiveOpIdx));
+      break;
+    case 64:
+      BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+              TII->get(AMDGPU::V_SET_INACTIVE_B64), PhiDest)
+          .addReg(NewPhiDest)
+          .add(Phi.getOperand(InactiveOpIdx));
+      break;
+    default: {
+      // For each 32-bit subregister of the register at InactiveOpIdx, insert
+      // a COPY to a new register, and a V_SET_INACTIVE_B32 using the
+      // corresponding subregisters of PhiDest and NewPhiDest.
+      // FIXME: There has to be a better way to iterate over this...
----------------
arsenm wrote:

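Is this just getSubRegFromChannel? That is, could the manual iteration over the 32-bit subregisters flagged by the FIXME above use that helper instead?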

https://github.com/llvm/llvm-project/pull/105822