[llvm] unpack packed instructions overlapped by MFMAs post-RA scheduling (PR #157968)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 18 12:19:30 PDT 2025


================
@@ -417,6 +454,281 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
   return true;
 }
 
+// If support is extended to new operations, add tests in
+// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
+bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  if (!TII->isNeverCoissue(MI))
+    return false;
+  switch (Opcode) {
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_MUL_F32:
+  case AMDGPU::V_PK_FMA_F32:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("Fully covered switch");
+}
+
+bool SIPreEmitPeephole::canUnpackingClobberRegister(
+    const MachineInstr &MI) {
+  unsigned OpCode = MI.getOpcode();
+  Register DstReg = MI.getOperand(0).getReg();
+  // Only the first register in the destination register pair needs to be
+  // checked because of the unpacking order: packed instructions are unpacked
+  // such that the lower 32 bits (the first register in the pair) are written
+  // first. This can introduce a dependency if that register is written by the
+  // first unpacked instruction and then read as a source by the second
+  // instruction, which computes the upper 32 bits. Such scenarios arise from
+  // specific combinations of the op_sel and op_sel_hi modifiers.
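+  // A hypothetical example for illustration: with DstReg = v[0:1] and
+  // src0 = v[0:1] where op_sel_hi selects the low half, the first unpacked
+  // instruction writes v0, and the second (upper-half) instruction would then
+  // read the clobbered v0 instead of the original value.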
+  Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
+
+  const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  if (Src0MO && Src0MO->isReg()) {
+    Register SrcReg0 = Src0MO->getReg();
+    unsigned Src0Mods =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
+    Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
+                             ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
+                             : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
+    // Check if the register selected by op_sel_hi is the same as the first
+    // register in the destination register pair.
+    if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
+      return true;
+  }
+
+  const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1MO && Src1MO->isReg()) {
+    Register SrcReg1 = Src1MO->getReg();
+    unsigned Src1Mods =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
+    Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
+                             ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
+                             : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
+    if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
+      return true;
+  }
+
+  // Applicable for packed instructions with 3 source operands, such as
+  // V_PK_FMA.
+  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
+    const MachineOperand *Src2MO =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    if (Src2MO && Src2MO->isReg()) {
+      Register SrcReg2 = Src2MO->getReg();
+      unsigned Src2Mods =
+          TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
+      Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
+                               ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
+                               : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
+      if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
+  unsigned Opcode = I.getOpcode();
+  // Use the 64-bit (VOP3 e64) encodings: they support source modifiers,
+  // whereas the e32 encodings do not.
+  switch (Opcode) {
+  case AMDGPU::V_PK_ADD_F32:
+    return AMDGPU::V_ADD_F32_e64;
+  case AMDGPU::V_PK_MUL_F32:
+    return AMDGPU::V_MUL_F32_e64;
+  case AMDGPU::V_PK_FMA_F32:
+    return AMDGPU::V_FMA_F32_e64;
+  default:
+    return std::numeric_limits<uint16_t>::max();
+  }
+  llvm_unreachable("Fully covered switch");
+}
+
+void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
+                                          unsigned SrcMods, bool IsHiBits,
+                                          const MachineOperand &SrcMO) {
+  unsigned NewSrcMods = 0;
+  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
+  unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
+  // Packed (VOP3P) instructions do not support ABS, so no ABS modifier checks
+  // are needed. If NEG or NEG_HI is set, the corresponding 32-bit lane must be
+  // negated. NEG_HI shares its bit position with ABS, but since VOP3P has no
+  // ABS, that bit always means NEG for the upper 32 bits and must be
+  // translated to the NEG source modifier of the unpacked instruction.
+  // Unpacked VOP3 instructions support ABS but not NEG_HI, so the NEG modifier
+  // is added explicitly whenever it is present in the packed instruction.
+  if (SrcMods & NegModifier)
+    NewSrcMods |= SISrcMods::NEG;
+  // Source modifiers: only negation is carried over. Unpacked instructions do
+  // not have op_sel, so it is handled explicitly below by selecting the
+  // appropriate sub-register.
+  NewMI.addImm(NewSrcMods);
+  if (SrcMO.isImm()) {
+    NewMI.addImm(SrcMO.getImm());
+    return;
+  }
+  // Select sub0 of the register pair if the relevant op_sel bit is clear,
+  // sub1 if it is set.
+  Register UnpackedSrcReg = (SrcMods & OpSelModifier)
+                                ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
+                                : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
+
+  MachineOperand UnpackedSrcMO =
+      MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
+  if (SrcMO.isKill()) {
+    // For each unpacked instruction, mark its source registers as killed if the
+    // corresponding source register in the original packed instruction was
+    // marked as killed.
+    //
+    // Exception:
+    // If the op_sel and op_sel_hi modifiers require both unpacked instructions
+    // to use the same register (e.g., due to overlapping access to low/high
+    // bits of the same packed register), then only the *second* (latter)
+    // instruction should mark the register as killed. This is because the
+    // second instruction handles the higher bits and is effectively the last
+    // user of the full register pair.
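+    // For example, if src0 is a killed v[2:3] and op_sel == op_sel_hi == 0,
+    // both unpacked instructions read v2; only the hi-half instruction
+    // (emitted second) keeps the kill flag on v2.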
+
+    bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
+    bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
+    bool KillState = true;
+    if ((OpSel == OpSelHi) && !IsHiBits)
+      KillState = false;
+    UnpackedSrcMO.setIsKill(KillState);
+  }
+  NewMI.add(UnpackedSrcMO);
+}
+
+void SIPreEmitPeephole::collectUnpackingCandidates(
+    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+    uint16_t NumMFMACycles) {
+  auto *BB = BeginMI.getParent();
+  auto E = BB->end();
+  int TotalCyclesBetweenCandidates = 0;
+  auto SchedModel = TII->getSchedModel();
+  Register MFMADef = BeginMI.getOperand(0).getReg();
+
+  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+    MachineInstr &Instr = *I;
+    if (Instr.isMetaInstruction())
+      continue;
+    if ((Instr.isTerminator()) ||
+        (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+        (SIInstrInfo::modifiesModeRegister(Instr) &&
+         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
+      return;
+
+    const MCSchedClassDesc *InstrSchedClassDesc =
+        SchedModel.resolveSchedClass(&Instr);
+    uint16_t Latency =
+        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+    TotalCyclesBetweenCandidates += Latency;
+
+    if (TotalCyclesBetweenCandidates > NumMFMACycles - 1)
+      return;
+    // Stop scanning if any def or use of this instruction overlaps the MFMA's
+    // destination. Checking every instruction in the window (not just
+    // unpacking candidates) also conservatively covers transitive
+    // dependencies, so we never incorrectly read or write registers.
+    for (const MachineOperand &InstrMO : Instr.operands()) {
+      if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
+        continue;
+      if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
+        return;
+    }
+    if (!isUnpackingSupportedInstr(Instr))
+      continue;
+
+    if (canUnpackingClobberRegister(Instr))
+      return;
+    // For a packed instruction that will be unpacked, adjust the cycle count:
+    // replace the packed instruction's latency with that of the two unpacked
+    // instructions, currently estimated at 2 cycles in total.
+    TotalCyclesBetweenCandidates -= Latency;
+    // TODO: improve latency handling based on instruction modeling.
+    TotalCyclesBetweenCandidates += 2;
+    // Subtract 1 to account for MFMA issue latency.
+    if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
+      InstrsToUnpack.insert(&Instr);
+  }
+  return;
+}
+
+void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
+  MachineOperand DstOp = I.getOperand(0);
+
+  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+  if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
----------------
jrbyrnes wrote:

We should not be failing to unpack at this point -- unless there is a circumstance which we can't account for earlier.

If we want this check, it should be done at collection time, but I think it is redundant with `isUnpackingSupportedInstr`.

I think this should just be an assert.
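
For reference, a minimal sketch of the assert form being suggested (hypothetical, not part of the PR; it assumes `mapToUnpackedOpcode` keeps returning the uint16_t max sentinel for unsupported opcodes):

```cpp
  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
  // Collection already filters with isUnpackingSupportedInstr, so reaching the
  // sentinel here would indicate a bug rather than a recoverable condition.
  assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
         "performF32Unpacking called on an unsupported packed instruction");
```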

https://github.com/llvm/llvm-project/pull/157968

