[llvm] unpack packed instructions overlapped by MFMAs post-RA scheduling (PR #157968)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 14 13:36:53 PDT 2025
================
@@ -417,6 +460,261 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
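+
+// Packed F32 arithmetic instructions that this peephole knows how to rewrite
+// as two unpacked VOP3 instructions so they can be co-issued with an
+// in-flight MFMA.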
+bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_FMA_F32:
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("Fully covered switch");
+}
+
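+// Returns true if any register defined by PredMI overlaps a register operand
+// of SuccMI. A candidate with such a dependency on the MFMA cannot overlap
+// the MFMA's execution and is not unpacked.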
+bool SIPreEmitPeephole::hasRWDependencies(const MachineInstr &PredMI,
+ const MachineInstr &SuccMI) {
+  for (const MachineOperand &PredOp : PredMI.operands()) {
+    if (!PredOp.isReg() || !PredOp.isDef())
+      continue;
+    Register PredReg = PredOp.getReg();
+    if (!PredReg.isValid())
+      continue;
+    for (const MachineOperand &SuccOp : SuccMI.operands()) {
+      if (!SuccOp.isReg())
+        continue;
+      Register SuccReg = SuccOp.getReg();
+      if (!SuccReg.isValid())
+        continue;
+      // regsOverlap also returns true for identical registers.
+      if (TRI->regsOverlap(PredReg, SuccReg))
+        return true;
+    }
+  }
+ return false;
+}
+
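+// Unpacking writes the low half of the destination before the high halves of
+// the sources are read. Returns true if that first write would clobber a
+// source sub-register that the high-half instruction still needs, in which
+// case the instruction must stay packed.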
+bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
+ const MachineInstr &MI) {
+ unsigned OpCode = MI.getOpcode();
+ bool IsFMA = OpCode == AMDGPU::V_PK_FMA_F32;
+  const MachineOperand &DstMO = MI.getOperand(0);
+ Register DstReg = DstMO.getReg();
+ Register SrcReg0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
+ Register SrcReg1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
+
+ Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
+ int Src0ModifiersIdx =
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers);
+ int Src1ModifiersIdx =
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src1_modifiers);
+ unsigned Src0Mods = MI.getOperand(Src0ModifiersIdx).getImm();
+ unsigned Src1Mods = MI.getOperand(Src1ModifiersIdx).getImm();
+
+ Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
+ ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
+ : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
+ Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
+ ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
+ : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
+  if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg) ||
+      TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
+ return true;
+ if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
+ int Src2ModifiersIdx =
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers);
+ unsigned Src2Mods = MI.getOperand(Src2ModifiersIdx).getImm();
+    Register SrcReg2 =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
+ Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
+ ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
+ : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
+    if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
+ return true;
+ }
+ return false;
+}
+
+uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
+ unsigned Opcode = I.getOpcode();
+  // Use the 64-bit encoding so the unpacked instructions are VOP3: the e64
+  // encodings accept source modifiers, while the e32 encodings do not.
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F32:
+ return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::V_PK_MUL_F32:
+ return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::V_PK_FMA_F32:
+ return AMDGPU::V_FMA_F32_e64;
+ default:
+ return std::numeric_limits<uint16_t>::max();
+ }
+}
+
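+// Appends the translated source modifiers and the source operand itself (an
+// immediate, or the sub-register selected by OP_SEL) to the unpacked
+// instruction being built.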
+void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
+ unsigned SrcMods,
+ unsigned NegModifier,
+ unsigned OpSelModifier,
+ const MachineOperand &SrcMO) {
+ unsigned NewSrcMods = 0;
+  // If NEG or NEG_HI is set, the corresponding 32-bit lane must be negated.
+  // NEG_HI shares its bit position with ABS, but packed instructions do not
+  // support ABS, so on a packed instruction the bit always means NEG_HI.
+  // Unpacked VOP3 instructions do support ABS, so the bit must be translated
+  // explicitly into a NEG source modifier on the unpacked instruction.
+ if (SrcMods & NegModifier)
+ NewSrcMods |= SISrcMods::NEG;
+  // Only the NEG modifier is carried over. Unpacked operations have no
+  // op_sel, so it is handled explicitly below by selecting the matching
+  // sub-register. ABS is never set on packed instructions, so it is ignored.
+ NewMI.addImm(NewSrcMods);
+ if (SrcMO.isImm()) {
+ NewMI.addImm(SrcMO.getImm());
+ return;
+ }
+  // OP_SEL selects the sub-register to read: sub1 if the bit is set,
+  // otherwise sub0.
+ Register UnpackedSrcReg = (SrcMods & OpSelModifier)
+ ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
+ : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
+ if (SrcMO.isReg() && SrcMO.isKill())
+ NewMI.addReg(UnpackedSrcReg, RegState::Kill);
+ else
+ NewMI.addReg(UnpackedSrcReg);
+}
+
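+// Walks forward from the MFMA BeginMI and collects packed instructions that
+// still fit within the MFMA's latency window of NumMFMACycles. Stops at the
+// first terminator, at any non-candidate instruction that can never be
+// co-issued, and at any candidate whose unpacking could introduce a
+// dependency.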
+void SIPreEmitPeephole::collectUnpackingCandidates(
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+ uint16_t NumMFMACycles) {
+ auto *BB = BeginMI.getParent();
+ auto E = BB->end();
+ int TotalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
+ for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+ MachineInstr &Instr = *I;
+ if (Instr.isMetaInstruction())
+ continue;
+ if (Instr.isTerminator())
+ return;
+ if (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr))
+ return;
+ const MCSchedClassDesc *InstrSchedClassDesc =
+ SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates +=
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+
+ if (TotalCyclesBetweenCandidates > NumMFMACycles)
+ return;
+ if (isUnpackingSupportedInstr(Instr)) {
+      assert(TII->isNeverCoissue(Instr) &&
+             "Packed instructions are expected to never co-issue");
+ if (hasRWDependencies(BeginMI, Instr))
+ return;
+ if (canUnpackingIntroduceDependencies(Instr))
+ return;
+      // This is a packed instruction: subtract its latency from the running
+      // total, because it will be removed and replaced by two unpacked
+      // instructions.
+ TotalCyclesBetweenCandidates -=
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+      // Add 2 cycles to account for the two unpacked instructions that
+      // replace it. At the time of writing, the unpacked instructions
+      // considered here each have a latency of 1.
+      // TODO: improve latency handling of possible inserted instructions.
+ TotalCyclesBetweenCandidates += 2;
+      if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
+ InstrsToUnpack.insert(&Instr);
+ }
+ }
+}
+
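+// Replaces the packed instruction I with two unpacked instructions, one per
+// 32-bit lane, propagating the undef, no-FP-except, contract, and renamable
+// state of the original operands.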
+void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
+  const MachineOperand &DstOp = I.getOperand(0);
+
+ uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
+ return;
+
+ MachineInstrBuilder Op0LOp1L =
+ createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
+  MachineOperand &LoDstOp = Op0LOp1L->getOperand(0);
+
+ if (DstOp.isUndef())
+ LoDstOp.setIsUndef();
+
+ MachineInstrBuilder Op0HOp1H =
+ createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
+  MachineOperand &HiDstOp = Op0HOp1H->getOperand(0);
+
+ if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+ Op0LOp1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0HOp1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ }
+ if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
+ Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract);
+ Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract);
+ }
+ if (DstOp.getReg().isPhysical() && DstOp.isRenamable()) {
+ LoDstOp.setIsRenamable(true);
+ HiDstOp.setIsRenamable(true);
+ }
+
+ I.eraseFromParent();
+}
+
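+// Builds one unpacked VOP3 instruction covering either the low (IsHiBits ==
+// false) or the high 32-bit lane of the packed instruction I.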
+MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
+ uint16_t UnpackedOpcode,
+ bool IsHiBits) {
+ MachineBasicBlock &MBB = *I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineOperand &DstMO = I.getOperand(0);
+ const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+ const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+ Register DstReg = DstMO.getReg();
+ unsigned OpCode = I.getOpcode();
+ Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
+ : TRI->getSubReg(DstReg, AMDGPU::sub0);
+
+ int ClampIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::clamp);
+ int64_t ClampVal = I.getOperand(ClampIdx).getImm();
+ int Src0ModifiersIdx =
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers);
+ int Src1ModifiersIdx =
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src1_modifiers);
+
+ unsigned Src0Mods = I.getOperand(Src0ModifiersIdx).getImm();
+ unsigned Src1Mods = I.getOperand(Src1ModifiersIdx).getImm();
+  // Packed instructions (VOP3P) do not support ABS, so the ABS modifiers can
+  // safely be ignored here.
+
+ unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
----------------
jrbyrnes wrote:
Can you just pass IsHiBits to `addOperandAndMods` and sink this handling into that function?
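Roughly (just a sketch of the suggested refactor; the lo/hi modifier
selection moves out of the call sites and into the callee):

    void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
                                              unsigned SrcMods, bool IsHiBits,
                                              const MachineOperand &SrcMO) {
      unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
      unsigned OpSelModifier =
          IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
      // ... rest of the body unchanged ...
    }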
https://github.com/llvm/llvm-project/pull/157968