[llvm] unpack packed instructions overlapped by MFMAs post-RA scheduling (PR #157968)

Jan Patrick Lehr via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 11 07:14:32 PDT 2025


================
@@ -417,6 +455,233 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
   return true;
 }
 
+// Returns true if \p MI is a packed FP32 instruction that this pass knows how
+// to unpack into two per-lane VOP3 instructions (see mapToUnpackedOpcode).
+bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_FMA_F32:
+    return true;
+  default:
+    return false;
+  }
+  // No llvm_unreachable needed: the switch has a default and every path
+  // returns, so anything after it would be dead code.
+}
+
+// Returns true if \p SuccMI reads or writes any register (or overlapping
+// register) that \p PredMI defines, i.e. there is a RAW or WAW dependence
+// between the two instructions.
+// Note: the previous version filtered SuccMI operands with isDef(), which
+// only detected write-after-write hazards and silently ignored uses; a
+// successor that merely *reads* PredMI's result was not reported. Checking
+// all register operands of SuccMI is conservative and therefore safe for the
+// caller, which skips unpacking when a dependency is found.
+bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI,
+                                                 const MachineInstr &SuccMI) {
+  for (const MachineOperand &PredMO : PredMI.operands()) {
+    if (!PredMO.isReg() || !PredMO.isDef())
+      continue;
+    Register PredReg = PredMO.getReg();
+    if (!PredReg.isValid())
+      continue;
+    for (const MachineOperand &SuccMO : SuccMI.operands()) {
+      if (!SuccMO.isReg())
+        continue;
+      Register SuccReg = SuccMO.getReg();
+      if (!SuccReg.isValid())
+        continue;
+      // regsOverlap already returns true for identical registers, so a
+      // separate equality check is redundant.
+      if (TRI->regsOverlap(PredReg, SuccReg))
+        return true;
+    }
+  }
+  return false;
+}
+
+// Maps a packed opcode to its unpacked per-lane equivalent, or
+// std::numeric_limits<uint16_t>::max() if \p I is not a supported packed
+// instruction.
+// The 64-bit (e64) encodings are used so that the result is a VOP3
+// instruction: VOP3 accepts source modifiers, which is required to translate
+// the VOP3P NEG/NEG_HI bits. The e32 forms are VOP2 and do not allow source
+// modifiers.
+uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
+  switch (I.getOpcode()) {
+  case AMDGPU::V_PK_ADD_F32:
+    return AMDGPU::V_ADD_F32_e64;
+  case AMDGPU::V_PK_MUL_F32:
+    return AMDGPU::V_MUL_F32_e64;
+  case AMDGPU::V_PK_FMA_F32:
+    return AMDGPU::V_FMA_F32_e64;
+  default:
+    return std::numeric_limits<uint16_t>::max();
+  }
+  // No llvm_unreachable needed: every path through the switch returns.
+}
+
+// Appends to \p NewMI the src_modifiers immediate and the unpacked 32-bit
+// source operand for one lane of a packed instruction.
+// \p Src_Mods      the VOP3P src_modifiers immediate of the packed source.
+// \p NegModifier   the NEG (low lane) or NEG_HI (high lane) bit to test.
+// \p OpSelModifier the OP_SEL (low lane) or OP_SEL_HI (high lane) bit that
+//                  selects which 32-bit half of the 64-bit source to use.
+// \p SrcMO         the packed source operand (register or immediate).
+void SIPreEmitPeephole::addOperandandMods(MachineInstrBuilder NewMI,
+                                          unsigned Src_Mods,
+                                          unsigned NegModifier,
+                                          unsigned OpSelModifier,
+                                          MachineOperand &SrcMO) {
+  unsigned NewSrcMods = 0;
+  // If NEG or NEG_HI is set, negate the corresponding 32-bit lane.
+  // NEG_HI shares its bit position with ABS, but packed (VOP3P) instructions
+  // do not support ABS, so NEG_HI must be translated to the NEG source
+  // modifier of the unpacked VOP3 instruction for the high 32 bits.
+  if (Src_Mods & NegModifier)
+    NewSrcMods |= SISrcMods::NEG;
+  // Only negation modifiers are forwarded. Unpacked operations have no
+  // op_sel (it is handled explicitly via subregister selection below), and
+  // ABS is never present on a packed source, so it is not forwarded either.
+  NewMI.addImm(NewSrcMods);
+  if (SrcMO.isImm()) {
+    NewMI.addImm(SrcMO.getImm());
+    return;
+  }
+  // Use the pass's cached TRI instead of re-deriving the register info
+  // through MI -> MBB -> MF -> subtarget on every call.
+  // op_sel == 1 selects the high half (sub1); op_sel == 0 the low half
+  // (sub0) of the 64-bit packed source register.
+  Register UnpackedSrcReg = (Src_Mods & OpSelModifier)
+                                ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
+                                : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
+  // Preserve the kill flag so liveness stays correct on the last use.
+  if (SrcMO.isKill())
+    NewMI.addReg(UnpackedSrcReg, RegState::Kill);
+  else
+    NewMI.addReg(UnpackedSrcReg);
+}
+
+void SIPreEmitPeephole::createListOfPackedInstr(
+    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+    uint16_t NumMFMACycles) {
+  auto *BB = BeginMI.getParent();
+  auto E = BB->end();
+  int TotalCyclesBetweenCandidates = 0;
+  auto SchedModel = TII->getSchedModel();
+  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+    MachineInstr &Instr = *I;
+    const MCSchedClassDesc *InstrSchedClassDesc =
+        SchedModel.resolveSchedClass(&Instr);
+    TotalCyclesBetweenCandidates +=
+        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+
+    if (Instr.isMetaInstruction())
+      continue;
+    if (Instr.isTerminator())
+      return;
+    if (TotalCyclesBetweenCandidates > NumMFMACycles)
+      return;
+    if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
+      if (hasReadWriteDependencies(BeginMI, Instr))
+        continue;
+
+      // If it is a packed instruction, we should subtract its latency from the
+      // overall latency calculation here, because the packed instruction will
+      // be removed and replaced by 2 unpacked instructions.
+      TotalCyclesBetweenCandidates -=
+          SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+      // We're adding 2 to account for the extra latency added by unpacking into
+      // 2 instructions. At the time of writing, the considered unpacked
+      // instructions have latency of 1.
+      // TODO: improve latency handling of possible inserted instructions
+      TotalCyclesBetweenCandidates += 2;
+      // if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) {
----------------
jplehr wrote:

Remove leftover commented-out code

https://github.com/llvm/llvm-project/pull/157968


More information about the llvm-commits mailing list