[llvm] unpack packed instructions overlapped by MFMAs post-RA scheduling (PR #157968)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 11 10:27:12 PDT 2025
================
@@ -417,6 +458,230 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
+bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { // Returns true only when MI's opcode is one of the packed F32 opcodes listed in the switch below.
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_FMA_F32:
+ return true;
+ default: // Every other opcode is rejected.
+ return false;
+ }
+ llvm_unreachable("Fully covered switch"); // Not reached: every path through the switch above returns.
+}
+
+bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI, // Returns true when a register defined by PredMI equals or overlaps a register defined by SuccMI.
+ const MachineInstr &SuccMI) {
+ for (const MachineOperand &PredOps : PredMI.operands()) {
+ if (!PredOps.isReg() || !PredOps.isDef()) // Only register defs of PredMI are considered.
+ continue;
+ Register PredReg = PredOps.getReg();
+ if (!PredReg.isValid()) // Skip operands whose register is not valid.
+ continue;
+ for (const MachineOperand &SuccOps : SuccMI.operands()) {
+ if (!SuccOps.isReg() || !SuccOps.isDef()) // NOTE(review): only defs of SuccMI are compared, so registers SuccMI merely reads are never examined despite the function name -- confirm this is intended.
+ continue;
+ Register SuccReg = SuccOps.getReg();
+ if (!SuccReg.isValid())
+ continue;
+ if ((PredReg == SuccReg) || TRI->regsOverlap(PredReg, SuccReg)) // Identical registers, or ones TRI reports as overlapping, count as a dependency.
+ return true;
+ }
+ }
+ return false; // No pair of defs matched.
+}
+
+uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { // Maps a packed F32 opcode to the e64 opcode of the corresponding unpacked instruction.
+ unsigned Opcode = I.getOpcode();
+ // Use the 64-bit (e64) encoding so that the unpacked instructions are VOP3.
+ // VOP3 instructions allow the VOP3P source modifiers to be translated to VOP3 ones;
+ // e32 instructions are VOP2 and don't allow source modifiers.
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F32:
+ return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::V_PK_MUL_F32:
+ return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::V_PK_FMA_F32:
+ return AMDGPU::V_FMA_F32_e64;
+ default: // No mapping for this opcode; presumably callers treat the max uint16_t value as "invalid" -- TODO confirm against call sites.
+ return std::numeric_limits<uint16_t>::max();
+ }
+ llvm_unreachable("Fully covered switch"); // Not reached: every path through the switch above returns.
+}
+
+void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, // Appends to NewMI a source-modifier immediate followed by the source operand (an immediate, or the sub0/sub1 sub-register of SrcMO).
+ unsigned Src_Mods,
+ unsigned NegModifier,
+ unsigned OpSelModifier,
+ MachineOperand &SrcMO) {
+ unsigned NewSrcMods = 0; // Modifier bits for the unpacked operand; only NEG may be set below.
+ const TargetRegisterInfo *RI = SrcMO.getParent() // Register info is looked up through SrcMO's chain of parents and their subtarget.
+ ->getParent()
+ ->getParent()
+ ->getSubtarget()
+ .getRegisterInfo();
+ // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
+ // lane.
+ // NEG_HI shares the same bit position with ABS. But packed instructions do
+ // not support ABS. Therefore, NEG_HI must be translated to NEG source
+ // modifier for the higher 32 bits. Unpacked VOP3 instructions do support
+ // ABS, therefore we need to explicitly add the NEG modifier if present in
+ // the packed instruction.
+ if (Src_Mods & NegModifier) // NegModifier is the bit mask the caller wants tested in Src_Mods.
+ NewSrcMods |= SISrcMods::NEG;
+ // Src modifiers. Only negative modifiers are added if needed. Unpacked
+ // operations do not have op_sel, therefore it must be handled explicitly as
+ // done below. Unpacked operations support abs, but packed instructions do
+ // not. Thus, abs is not handled.
+ NewMI.addImm(NewSrcMods);
+ if (SrcMO.isImm()) { // Immediate sources are copied through unchanged; there is no sub-register to pick.
+ NewMI.addImm(SrcMO.getImm());
+ return;
+ }
+ // If the op_sel bit is clear, select sub0 of the source register; otherwise select sub1.
+ Register UnpackedSrcReg = (Src_Mods & OpSelModifier)
+ ? RI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
+ : RI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
+ if (SrcMO.isReg() && SrcMO.isKill()) // Carry SrcMO's kill flag over to the selected sub-register operand.
+ NewMI.addReg(UnpackedSrcReg, RegState::Kill);
+ else
+ NewMI.addReg(UnpackedSrcReg);
+}
+
+void SIPreEmitPeephole::selectSuitableInstrsForUnpacking(
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+ uint16_t NumMFMACycles) {
+ auto *BB = BeginMI.getParent();
+ auto E = BB->end();
+ int TotalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
+ for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+ MachineInstr &Instr = *I;
+ const MCSchedClassDesc *InstrSchedClassDesc =
+ SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates +=
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+
+ if (Instr.isMetaInstruction())
+ continue;
+ if (Instr.isTerminator())
+ return;
+ if (TotalCyclesBetweenCandidates > NumMFMACycles)
+ return;
+ if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
+ if (hasReadWriteDependencies(BeginMI, Instr))
+ continue;
----------------
jrbyrnes wrote:
Should we return here? If there is a dependency between `BeginMI` and `Instr`, then `Instr` needs to wait for `BeginMI` to finish executing -- we shouldn't be unpacking instructions after `Instr`.
https://github.com/llvm/llvm-project/pull/157968
More information about the llvm-commits
mailing list