[llvm] Co-issue packed instructions by unpacking (PR #151704)
Akash Dutta via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 6 14:24:40 PDT 2025
https://github.com/akadutta updated https://github.com/llvm/llvm-project/pull/151704
>From 7c443285ec2df426ca1ff93f236fcb67d735338f Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 29 Jul 2025 15:40:34 -0500
Subject: [PATCH 1/6] initial commit
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 409 ++++++++++++++++++
1 file changed, 409 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..d76502d18f7e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -39,6 +39,21 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
+#include "AMDGPURegisterBankInfo.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/InitializePasses.h"
+#include <unordered_set>
+
+#include "GCNSchedStrategy.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
@@ -53,6 +68,17 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
+ bool unpackInsts(MachineFunction &MF);
+ bool createListOfPackedInstr(MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen);
+ bool isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const;
+ bool isUnpackingSupportedInstr(MachineInstr &MI) const;
+ void insertMI(MachineInstr &I);
+ SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
+ unsigned SGPRSrcPos);
+ SmallVector<MachineInstr *, 2>
+ insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
+ MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
+ bool isVreg_64);
public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -62,6 +88,7 @@ class GCNPreRAOptimizationsImpl {
class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
public:
static char ID;
+ const MachineLoopInfo *MLI = nullptr;
GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {
initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry());
@@ -75,6 +102,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervalsWrapperPass>();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -225,10 +253,390 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
+bool GCNPreRAOptimizationsImpl::isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const {
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ // bool IsGFX942Only = ST.hasGFX940Insts() && !ST.hasGFX950Insts();
+ // if (!IsGFX942Only)
+ // return false;
+
+ if (!SIInstrInfo::isVALU(MI)){
+ return false;
+ }
+
+
+ // V_COS, V_EXP, V_RCP, etc.
+ if (SIInstrInfo::isTRANS(MI))
+ return true;
+
+ // DOT2, DOT2C, DOT4, etc.
+ if (SIInstrInfo::isDOT(MI))
+ return true;
+
+ // MFMA, SMFMA
+ if (SIInstrInfo::isMFMA(MI))
+ return true;
+
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_CVT_PK_BF8_F32_e64:
+ case AMDGPU::V_CVT_PK_FP8_F32_e64:
+ case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+ case AMDGPU::V_MQSAD_U32_U8_e64:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_I16:
+ case AMDGPU::V_PK_ADD_U16:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_FMA_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMAC_F16_e32:
+ case AMDGPU::V_PK_FMAC_F16_e64:
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_MAD_I16:
+ case AMDGPU::V_PK_MAD_U16:
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_PK_MAX_I16:
+ case AMDGPU::V_PK_MAX_U16:
+ case AMDGPU::V_PK_MIN_F16:
+ case AMDGPU::V_PK_MIN_I16:
+ case AMDGPU::V_PK_MIN_U16:
+ case AMDGPU::V_PK_MOV_B32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_LO_U16:
+ case AMDGPU::V_PK_SUB_I16:
+ case AMDGPU::V_PK_SUB_U16:
+ case AMDGPU::V_QSAD_PK_U16_U8_e64:
+ return true;
+
+ default:
+ return false;
+
+ }
+}
+
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ return true;
+
+ default:
+ return false;
+
+ }
+}
+
+SmallVector<MachineInstr *, 2>
+GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
+ unsigned SGPRSrcPos) {
+ SmallVector<MachineInstr *, 2> MIList;
+
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
+ MachineInstr *CopySGPR1 =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY))
+ .addDef(TmpReg, RegState::Undef)
+ .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub0);
+ unsigned SubIdx = TRI->composeSubRegIndices(
+ AMDGPU::sub0, CopySGPR1->getOperand(0).getSubReg());
+ CopySGPR1->getOperand(0).setReg(CopySGPR1->getOperand(0).getReg());
+ CopySGPR1->getOperand(0).setSubReg(SubIdx);
+ LIS->InsertMachineInstrInMaps(*CopySGPR1);
+ MIList.push_back(CopySGPR1);
+
+ MachineInstr *CopySGPR2 =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY))
+ .addDef(TmpReg)
+ .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub1);
+ SubIdx = TRI->composeSubRegIndices(AMDGPU::sub1,
+ CopySGPR2->getOperand(0).getSubReg());
+ CopySGPR2->getOperand(0).setReg(CopySGPR2->getOperand(0).getReg());
+ CopySGPR2->getOperand(0).setSubReg(SubIdx);
+ LIS->InsertMachineInstrInMaps(*CopySGPR2);
+ MIList.push_back(CopySGPR2);
+ return MIList;
+}
+
+bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
+ MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen) {
+ auto *BB = BeginMI.getParent();
+ auto *MF = BB->getParent();
+ int NumInst = 0;
+
+ auto E = BB->end();
+ auto schedModel = TII->getSchedModel();
+ const MCSchedClassDesc *schedClassDesc = schedModel.resolveSchedClass(&BeginMI);
+ const int NumMFMACycles = schedModel.getWriteProcResBegin(schedClassDesc)->ReleaseAtCycle;
+ int totalCyclesBetweenCandidates = 0;
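+ // Walk forward from the MFMA and accumulate the ReleaseAtCycle estimates of
+ // the following instructions. Packed instructions that still fall within the
+ // MFMA's latency window are recorded as unpacking candidates; the scan stops
+ // at a terminator or once the window is exhausted.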
+ for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+ MachineInstr &Instr = *I;
+ const MCSchedClassDesc *instrSchedClassDesc = schedModel.resolveSchedClass(&Instr);
+ totalCyclesBetweenCandidates += schedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
+ if (Instr.isMetaInstruction())
+ continue;
+
+ if (Instr.isTerminator())
+ return false;
+
+ if (totalCyclesBetweenCandidates > NumMFMACycles)
+ return false;
+
+ if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F32) && isNeverCoissue(Instr, Instr.getParent()->getParent())) {
+ totalCyclesBetweenCandidates += 1;
+ seen.insert(&Instr);
+ }
+ }
+ return true;
+}
+
+SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
+ MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
+ MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64) {
+
+ SmallVector<MachineInstr *, 2> MIList;
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register DstReg = DstMO.getReg();
+
+ unsigned SrcSubIdx1 =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
+ unsigned SrcSubIdx2 =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
+ unsigned DestSubIdx =
+ TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
+
+ const MCInstrDesc instrDesc = I.getDesc();
+
+ int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int64_t clampVal = I.getOperand(clampIdx).getImm();
+
+ int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
+ unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
+
+ // Don't worry about abs modifiers; packed (VOP3P) instructions do not support them.
+ unsigned Lo_src0_mods = 0;
+ unsigned Lo_src1_mods = 0;
+
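+ // Build the instruction computing the low half of the packed operation. The
+ // op_sel bit of each source modifier selects whether this half reads sub0 or
+ // sub1 of the corresponding 64-bit source register.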
+ MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst
+ if (src0_Mods & SISrcMods::OP_SEL_0) {
+ if (src0_Mods & SISrcMods::NEG) {
+ Lo_src0_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0
+ }
+ else {
+ Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel == 0, select register 0 of reg:sub0_sub1
+ }
+
+ if (src1_Mods & SISrcMods::OP_SEL_0) {
+ if (src1_Mods & SISrcMods::NEG) {
+ Lo_src1_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src1
+ }
+ else {
+ Op0L_Op1L.addImm(Lo_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src1 //if op_sel == 0, select register 0 of reg:sub0_sub1
+ }
+ Op0L_Op1L.addImm(clampVal); //clamp
+ // Packed instructions do not support output modifiers, so it is safe to use 0 here.
+ Op0L_Op1L.addImm(0); //omod
+
+ if (isVreg_64) {
+ Op0L_Op1L->getOperand(0).setIsUndef();
+ }
+ else {
+ if (I.getOperand(0).isUndef()) {
+ Op0L_Op1L->getOperand(0).setIsUndef();
+ }
+ }
+
+ LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
+
+ SrcSubIdx1 =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ SrcSubIdx2 =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ DestSubIdx =
+ TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
+
+ // Don't worry about abs modifiers; packed (VOP3P) instructions do not support them.
+ unsigned Hi_src0_mods = 0;
+ unsigned Hi_src1_mods = 0;
+
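+ // Build the instruction computing the high half; here op_sel_hi selects the
+ // subregister read for each source, and neg_hi is translated to the VOP3 neg
+ // modifier.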
+ MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst
+ if (src0_Mods & SISrcMods::OP_SEL_1) {
+ if (src0_Mods & SISrcMods::NEG_HI) {
+ Hi_src0_mods |= SISrcMods::NEG;
+ }
+ Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0
+ }
+ else {
+ Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ }
+
+ if (src1_Mods & SISrcMods::OP_SEL_1) {
+ if (src1_Mods & SISrcMods::NEG_HI) {
+ Hi_src1_mods |= SISrcMods::NEG;
+ }
+ Op0H_Op1H.addImm(Hi_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src1
+ }
+ else {
+ Op0H_Op1H.addImm(Hi_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0);
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src1 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ }
+ Op0H_Op1H.addImm(clampVal); //clamp
+ // Packed instructions do not support output modifiers, so it is safe to use 0 here.
+ Op0H_Op1H.addImm(0); //omod
+ LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
+
+ if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ }
+ LIS->RemoveMachineInstrFromMaps(I);
+ I.eraseFromParent();
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+ MIList.push_back(Op0L_Op1L);
+ MIList.push_back(Op0H_Op1H);
+ return MIList;
+}
+
+void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+
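+ // Operand layout assumed for the VOP3P instructions handled here: operand 0
+ // is vdst, operands 2 and 4 are src0 and src1. The source modifier
+ // immediates (operands 1 and 3) are read in insertUnpackedMI by named
+ // operand index.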
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg1 = I.getOperand(2).getReg();
+ Register SrcReg2 = I.getOperand(4).getReg();
+
+ MachineOperand &DstMO = I.getOperand(0);
+ MachineOperand &SrcMO1 = I.getOperand(2);
+ MachineOperand &SrcMO2 = I.getOperand(4);
+
+ MachineBasicBlock::iterator MII = I;
+ const DebugLoc &DL = I.getDebugLoc();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg());
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg());
+ const TargetRegisterClass *Src0SubRC =
+ TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1);
+
+ if ((Src1RC->getID() == AMDGPU::SGPR_64RegClassID) ||
+ (Src0RC->getID() == AMDGPU::SGPR_64RegClassID)) {
+ if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ // try with sgpr32
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 4);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
+
+ if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), true);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ } else {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), false);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ }
+ }
+ else {
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
+
+ if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, true);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ } else {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, false);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ }
+ }
+ return;
+ }
+
+ if (DstRC->getID() == AMDGPU::VReg_512_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, SrcMO2, SrcMO1,
+ SrcMO2, false);
+ }
+ else if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, SrcMO2, SrcMO1,
+ SrcMO2, true);
+ }
+ return;
+}
+
+bool GCNPreRAOptimizationsImpl::unpackInsts(MachineFunction &MF) {
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
+ auto schedModel = TII->getSchedModel();
+ for (MachineBasicBlock &MBB : MF) {
+ std::unordered_set<MachineInstr *> seen;
+ for (MachineInstr &MI : MBB) {
+ if (SIInstrInfo::isMFMA(MI)){
+ createListOfPackedInstr(MI, seen);
+ }
+
+ }
+ if (!seen.empty()) {
+ for (MachineInstr *MI : seen)
+ insertMI(*MI);
+ }
+ }
+ return true;
+}
+
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+ MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
return GCNPreRAOptimizationsImpl(LIS).run(MF);
}
@@ -248,6 +656,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
bool Changed = false;
+ Changed = unpackInsts(MF);
for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!LIS->hasInterval(Reg))
>From 4bff9657e7016452f6657f1c217e804fd354d3ae Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 29 Jul 2025 15:44:14 -0500
Subject: [PATCH 2/6] add test
---
...unpack-non-coissue-insts-post-scheduler.ll | 116 ++++++++++++++++++
1 file changed, 116 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
new file mode 100644
index 0000000000000..5c6d376c92e65
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
@@ -0,0 +1,116 @@
+; TODO: change variable names. Make test smaller if possible
+
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+ at global_smem = external addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.amdgcn.exp2.f32(float)
+
+; Function Attrs: nofree norecurse nounwind
+define amdgpu_kernel void @attn_fwd(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg readonly captures(none) %2, ptr addrspace(1) inreg writeonly captures(none) %3, ptr addrspace(1) inreg writeonly captures(none) %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %12, i32 inreg %13, i32 inreg %14, i32 inreg %15, i32 inreg %16, i32 inreg %17, i32 inreg %18, i32 inreg %19, i32 inreg %20, i32 inreg %21, i32 inreg %22, float inreg %23, i32 inreg %24, ptr addrspace(1) inreg readnone captures(none) %25, i32 inreg %26, ptr addrspace(1) inreg readnone captures(none) %27) local_unnamed_addr {
+ %29 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+
+ %96 = sext i32 %8 to i64
+ %97 = getelementptr half, ptr addrspace(1) %1, i64 %96
+
+ %115 = icmp slt i32 %29, 16384
+
+ %135 = icmp slt i32 %29, 1
+
+ %215 = getelementptr half, ptr addrspace(3) @global_smem, i32 %29
+ %216 = load <8 x half>, ptr addrspace(3) %215, align 16
+
+ %276 = shl nuw nsw i32 %29, 7
+
+ %396 = getelementptr half, ptr addrspace(1) %97, i64 1
+ %397 = sext i32 %13 to i64
+ %398 = getelementptr half, ptr addrspace(1) %97, i64 %397
+
+ %536 = fsub float 0xFFF0000000000000, 0.5
+ %537 = tail call float @llvm.amdgcn.exp2.f32(float %536)
+
+ %538 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %29
+ %539 = load <8 x half>, ptr addrspace(3) %538, align 16
+
+ %573 = icmp ult i32 1, 511
+ br i1 %573, label %575, label %574
+
+574: ; preds = %28
+ br label %575
+
+575: ; preds = %574, %28
+ %610 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+
+ br label %686
+
+686: ; preds = %575, %686
+ %.pn347561 = phi float [ %537, %575 ], [ %1329, %686 ]
+
+
+ %690 = phi i32 [ 0, %575 ], [ %1120, %686 ]
+ %691 = phi ptr addrspace(1) [ %398, %575 ], [ %1117, %686 ]
+ %692 = phi ptr addrspace(1) [ %396, %575 ], [ %1116, %686 ]
+
+ %695 = phi <2 x half> [ %610, %575 ], [ %1414, %686 ]
+
+
+ %759 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ]
+ %760 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ]
+
+ %tmp6 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ]
+ %tmp7 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ]
+
+ %871 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.s.setprio(i16 0)
+ %872 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> %871, i32 0, i32 0, i32 0)
+ %879 = extractelement <16 x float> %872, i64 0
+
+
+ %957 = insertelement <2 x float> poison, float %.pn347561, i64 0
+ %958 = shufflevector <2 x float> %957, <2 x float> poison, <2 x i32> zeroinitializer
+ %959 = fmul <2 x float> %759, %958
+ %960 = fmul <2 x float> %760, %958
+
+ %tmp1 = fmul <2 x float> %tmp6, %958
+ %tmp2 = fmul <2 x float> %tmp7, %958
+
+ %1048 = shufflevector <2 x half> %695, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+
+ %1116 = getelementptr half, ptr addrspace(1) %692, i64 1
+ %1117 = getelementptr half, ptr addrspace(1) %691, i64 %397
+
+ %1119 = icmp slt i32 %690, 2
+ %1120 = select i1 %1119, i32 %690, i32 0
+ %.idx359 = shl i32 %1120, 14
+ %1121 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx359
+
+ %1140 = shufflevector <8 x half> %1048, <8 x half> %1048, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+
+ %1157 = shufflevector <2 x float> %959, <2 x float> %960, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+
+ %1173 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %1157, i32 0, i32 0, i32 0)
+ %tmp4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %tmp3, i32 0, i32 0, i32 0)
+
+
+ %1329 = tail call float @llvm.amdgcn.exp2.f32(float %879)
+
+ %.idx367 = shl i32 %690, 14
+ %1404 = getelementptr i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx367
+
+ %1412 = add nuw nsw i32 0, 64
+ %1413 = icmp samesign ult i32 0, 7936
+ %1414 = shufflevector <8 x half> %1140, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+
+ %1478 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> <i32 0, i32 1>
+ %tmp5 = shufflevector <16 x float> %tmp4, <16 x float> poison, <2 x i32> <i32 0, i32 1>
+
+ br i1 %1413, label %686, label %1510
+
+1510: ; preds = %686
+ ret void
+}
>From d3b19c668d30e4dc906a301c13d2cf6a2e434c7a Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 29 Jul 2025 16:31:30 -0500
Subject: [PATCH 3/6] code cleanup
---
llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index d76502d18f7e7..e2c65bf25d31c 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -39,19 +39,12 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
-#include "AMDGPURegisterBankInfo.h"
#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
#include <unordered_set>
#include "GCNSchedStrategy.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
@@ -88,7 +81,6 @@ class GCNPreRAOptimizationsImpl {
class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
public:
static char ID;
- const MachineLoopInfo *MLI = nullptr;
GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {
initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry());
@@ -102,7 +94,6 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervalsWrapperPass>();
- AU.addRequired<MachineLoopInfoWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -636,7 +627,6 @@ bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
- MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
return GCNPreRAOptimizationsImpl(LIS).run(MF);
}
>From c581612e5cd376b5ee6ef19626444dec25e077d6 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 31 Jul 2025 20:02:28 -0500
Subject: [PATCH 4/6] miscellaneous code optimizations and cleanup
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 201 ++++++------------
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 59 ++++-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 1 +
3 files changed, 127 insertions(+), 134 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index e2c65bf25d31c..844fc1439099f 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -28,6 +28,12 @@
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
/// the VGPR_32, the COPY can be completely eliminated.
///
+/// Additionally, this pass unpacks packed instructions (V_PK_MUL_F32 and
+/// V_PK_ADD_F32) adjacent to MFMAs so that they can be co-issued. This
+/// helps the machine scheduler overlap MFMAs with certain vector
+/// instructions and is expected to improve performance. Only packed
+/// instructions that are covered by the MFMA's latency are unpacked; the
+/// rest remain untouched.
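+///
+/// As an illustrative sketch (simplified MIR, modifier operands elided):
+///   %dst:vreg_64_align2 = V_PK_MUL_F32 ..., %src0:vreg_64_align2, ..., %src1:vreg_64_align2, ...
+/// is rewritten, when it falls within an MFMA's latency window, into two
+/// independently issuable 32-bit multiplies:
+///   %dst.sub0 = V_MUL_F32_e64 ..., %src0.sub0, ..., %src1.sub0, ...
+///   %dst.sub1 = V_MUL_F32_e64 ..., %src0.sub1, ..., %src1.sub1, ...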
//===----------------------------------------------------------------------===//
#include "GCNPreRAOptimizations.h"
@@ -38,12 +44,10 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
-
+#include "llvm/ADT/DenseSet.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
-#include <unordered_set>
-
#include "GCNSchedStrategy.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
@@ -61,11 +65,10 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
- bool unpackInsts(MachineFunction &MF);
- bool createListOfPackedInstr(MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen);
- bool isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const;
+ bool createListOfPackedInstr(MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack);
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
void insertMI(MachineInstr &I);
+ uint16_t mapToUnpackedOpcode(MachineInstr &I);
SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
unsigned SGPRSrcPos);
SmallVector<MachineInstr *, 2>
@@ -244,80 +247,28 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
-bool GCNPreRAOptimizationsImpl::isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const {
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- // bool IsGFX942Only = ST.hasGFX940Insts() && !ST.hasGFX950Insts();
- // if (!IsGFX942Only)
- // return false;
-
- if (!SIInstrInfo::isVALU(MI)){
- return false;
- }
-
-
- // V_COS, V_EXP, V_RCP, etc.
- if (SIInstrInfo::isTRANS(MI))
- return true;
-
- // DOT2, DOT2C, DOT4, etc.
- if (SIInstrInfo::isDOT(MI))
- return true;
-
- // MFMA, SMFMA
- if (SIInstrInfo::isMFMA(MI))
- return true;
-
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- case AMDGPU::V_CVT_PK_BF8_F32_e64:
- case AMDGPU::V_CVT_PK_FP8_F32_e64:
- case AMDGPU::V_MQSAD_PK_U16_U8_e64:
- case AMDGPU::V_MQSAD_U32_U8_e64:
- case AMDGPU::V_PK_ADD_F16:
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_ADD_I16:
- case AMDGPU::V_PK_ADD_U16:
- case AMDGPU::V_PK_ASHRREV_I16:
- case AMDGPU::V_PK_FMA_F16:
- case AMDGPU::V_PK_FMA_F32:
- case AMDGPU::V_PK_FMAC_F16_e32:
- case AMDGPU::V_PK_FMAC_F16_e64:
- case AMDGPU::V_PK_LSHLREV_B16:
- case AMDGPU::V_PK_LSHRREV_B16:
- case AMDGPU::V_PK_MAD_I16:
- case AMDGPU::V_PK_MAD_U16:
- case AMDGPU::V_PK_MAX_F16:
- case AMDGPU::V_PK_MAX_I16:
- case AMDGPU::V_PK_MAX_U16:
- case AMDGPU::V_PK_MIN_F16:
- case AMDGPU::V_PK_MIN_I16:
- case AMDGPU::V_PK_MIN_U16:
- case AMDGPU::V_PK_MOV_B32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_MUL_LO_U16:
- case AMDGPU::V_PK_SUB_I16:
- case AMDGPU::V_PK_SUB_U16:
- case AMDGPU::V_QSAD_PK_U16_U8_e64:
- return true;
-
- default:
- return false;
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F32:
+ return true;
+
+ default:
+ return false;
}
}
-bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
- unsigned Opcode = MI.getOpcode();
+uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
+ unsigned Opcode = I.getOpcode();
switch (Opcode) {
- case AMDGPU::V_PK_ADD_F16:
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_MUL_F32:
- return true;
-
- default:
- return false;
+ case AMDGPU::V_PK_ADD_F32:
+ return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::V_PK_MUL_F32:
+ return AMDGPU::V_MUL_F32_e64;
+ default:
+ return std::numeric_limits<uint16_t>::max();
}
}
@@ -358,7 +309,7 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
}
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
- MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen) {
+ MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack) {
auto *BB = BeginMI.getParent();
auto *MF = BB->getParent();
int NumInst = 0;
@@ -377,13 +328,13 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
if (Instr.isTerminator())
return false;
-
+
if (totalCyclesBetweenCandidates > NumMFMACycles)
return false;
- if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F32) && isNeverCoissue(Instr, Instr.getParent()->getParent())) {
+ if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
totalCyclesBetweenCandidates += 1;
- seen.insert(&Instr);
+ instrsToUnpack.insert(&Instr);
}
}
return true;
@@ -420,8 +371,8 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
//don't worry about abs values. Packed instructions (VOP3P) do not support them
unsigned Lo_src0_mods = 0;
unsigned Lo_src1_mods = 0;
-
- MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
+ MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst
if (src0_Mods & SISrcMods::OP_SEL_0) {
if (src0_Mods & SISrcMods::NEG) {
@@ -476,7 +427,7 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
unsigned Hi_src0_mods = 0;
unsigned Hi_src1_mods = 0;
- MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst
if (src0_Mods & SISrcMods::OP_SEL_1) {
if (src0_Mods & SISrcMods::NEG_HI) {
@@ -600,29 +551,6 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
return;
}
-bool GCNPreRAOptimizationsImpl::unpackInsts(MachineFunction &MF) {
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- TII = ST.getInstrInfo();
- TRI = &TII->getRegisterInfo();
-
- auto schedModel = TII->getSchedModel();
- for (MachineBasicBlock &MBB : MF) {
- std::unordered_set<MachineInstr *> seen;
- for (MachineInstr &MI : MBB) {
- if (SIInstrInfo::isMFMA(MI)){
- createListOfPackedInstr(MI, seen);
- }
-
- }
- if (!seen.empty()) {
- for (MachineInstr *MI : seen)
- insertMI(*MI);
- }
- }
- return true;
-}
-
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -646,7 +574,6 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
bool Changed = false;
- Changed = unpackInsts(MF);
for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!LIS->hasInterval(Reg))
@@ -659,38 +586,46 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
Changed |= processReg(Reg);
}
- if (!ST.useRealTrue16Insts())
- return Changed;
-
// Add RA hints to improve True16 COPY elimination.
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- if (MI.getOpcode() != AMDGPU::COPY)
- continue;
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- if (Dst.isVirtual() &&
- MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
- Src.isPhysical() &&
- TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
- if (Src.isVirtual() &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
- Dst.isPhysical() &&
- TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
- if (!Dst.isVirtual() || !Src.isVirtual())
- continue;
- if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
- MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
- MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ // Unpack packed instructions to overlap with MFMAs. This allows the compiler
+ // to co-issue the unpacked instructions with the MFMAs.
+ for (MachineBasicBlock &MBB : MF) {
+ DenseSet<MachineInstr *> instrsToUnpack;
+ for (MachineInstr &MI : MBB) {
+ if (SIInstrInfo::isMFMA(MI)){
+ createListOfPackedInstr(MI, instrsToUnpack);
+ }
+ if (ST.useRealTrue16Insts()){
+ if (MI.getOpcode() != AMDGPU::COPY)
+ continue;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (Dst.isVirtual() &&
+ MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ Src.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+ if (Src.isVirtual() &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+ Dst.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ continue;
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+ MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ }
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
}
- if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+ }
+
+ if (!instrsToUnpack.empty()) {
+ for (MachineInstr *MI : instrsToUnpack)
+ insertMI(*MI);
}
}
-
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c2da937552240..5562ff590b71d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -15,7 +15,6 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
-#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -6173,6 +6172,64 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return isImmOperandLegal(MI, OpIdx, *MO);
}
+bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
+ bool IsGFX950Only = ST.hasGFX950Insts();
+ if (!IsGFX950Only)
+ return false;
+
+ if (!isVALU(MI))
+ return false;
+
+ // V_COS, V_EXP, V_RCP, etc.
+ if (isTRANS(MI))
+ return true;
+
+ // DOT2, DOT2C, DOT4, etc.
+ if (isDOT(MI))
+ return true;
+
+ // MFMA, SMFMA
+ if (isMFMA(MI))
+ return true;
+
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_CVT_PK_BF8_F32_e64:
+ case AMDGPU::V_CVT_PK_FP8_F32_e64:
+ case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+ case AMDGPU::V_MQSAD_U32_U8_e64:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_I16:
+ case AMDGPU::V_PK_ADD_U16:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_FMA_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMAC_F16_e32:
+ case AMDGPU::V_PK_FMAC_F16_e64:
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_MAD_I16:
+ case AMDGPU::V_PK_MAD_U16:
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_PK_MAX_I16:
+ case AMDGPU::V_PK_MAX_U16:
+ case AMDGPU::V_PK_MIN_F16:
+ case AMDGPU::V_PK_MIN_I16:
+ case AMDGPU::V_PK_MIN_U16:
+ case AMDGPU::V_PK_MOV_B32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_LO_U16:
+ case AMDGPU::V_PK_SUB_I16:
+ case AMDGPU::V_PK_SUB_U16:
+ case AMDGPU::V_QSAD_PK_U16_U8_e64:
+ return true;
+ default:
+ return false;
+ }
+}
+
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e042b59eb0f04..b7a0388470279 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1178,6 +1178,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const;
+ bool isNeverCoissue(MachineInstr &MI) const;
/// Return true if this 64-bit VALU instruction has a 32-bit encoding.
/// This function will return false if you pass it a 32-bit instruction.
bool hasVALU32BitEncoding(unsigned Opcode) const;
>From c695b99ddae061127e015daf523b8eeec7888b71 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 1 Aug 2025 09:14:29 -0500
Subject: [PATCH 5/6] add code comments
---
llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 844fc1439099f..0f7009a6ea394 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -262,6 +262,9 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) cons
uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
unsigned Opcode = I.getOpcode();
+ // Use the 64-bit encoding so that the unpacked instructions are VOP3:
+ // VOP3P source modifiers can then be translated to VOP3 modifiers,
+ // whereas the e32 (VOP2) encodings do not accept source modifiers.
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
return AMDGPU::V_ADD_F32_e64;
>From 1a51a42d4c633cd1a1a84878b2a3dce6764473b4 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Wed, 6 Aug 2025 16:24:08 -0500
Subject: [PATCH 6/6] removing repetitive code, capitalize vars
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 95 +++++++------------
1 file changed, 36 insertions(+), 59 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 0f7009a6ea394..f56d73e990269 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -318,25 +318,25 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
int NumInst = 0;
auto E = BB->end();
- auto schedModel = TII->getSchedModel();
- const MCSchedClassDesc *schedClassDesc = schedModel.resolveSchedClass(&BeginMI);
- const int NumMFMACycles = schedModel.getWriteProcResBegin(schedClassDesc)->ReleaseAtCycle;
- int totalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
+ const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&BeginMI);
+ const int NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ int TotalCyclesBetweenCandidates = 0;
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
- const MCSchedClassDesc *instrSchedClassDesc = schedModel.resolveSchedClass(&Instr);
- totalCyclesBetweenCandidates += schedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
+ const MCSchedClassDesc *instrSchedClassDesc = SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates += SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
if (Instr.isMetaInstruction())
continue;
if (Instr.isTerminator())
return false;
- if (totalCyclesBetweenCandidates > NumMFMACycles)
+ if (TotalCyclesBetweenCandidates > NumMFMACycles)
return false;
if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
- totalCyclesBetweenCandidates += 1;
+ TotalCyclesBetweenCandidates += 1;
instrsToUnpack.insert(&Instr);
}
}
@@ -411,10 +411,8 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
if (isVreg_64) {
Op0L_Op1L->getOperand(0).setIsUndef();
}
- else {
- if (I.getOperand(0).isUndef()) {
- Op0L_Op1L->getOperand(0).setIsUndef();
- }
+ else if (I.getOperand(0).isUndef()){
+ Op0L_Op1L->getOperand(0).setIsUndef();
}
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
@@ -499,58 +497,37 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1);
- if ((Src1RC->getID() == AMDGPU::SGPR_64RegClassID) ||
- (Src0RC->getID() == AMDGPU::SGPR_64RegClassID)) {
- if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) {
- // try with sgpr32
- SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 4);
- MachineInstr *CopySGPR1 = copyInstrs[0];
- MachineInstr *CopySGPR2 = copyInstrs[1];
-
- if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
- CopySGPR2->getOperand(0), true);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
- } else {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
- CopySGPR2->getOperand(0), false);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
- }
- }
- else {
- SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
- MachineInstr *CopySGPR1 = copyInstrs[0];
- MachineInstr *CopySGPR2 = copyInstrs[1];
-
- if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, true);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
- } else {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, false);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
- }
- }
- return;
- }
+ if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ // try with sgpr32
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 4);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
- if (DstRC->getID() == AMDGPU::VReg_512_Align2RegClassID) {
+ bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, SrcMO2, SrcMO1,
- SrcMO2, false);
+ I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), isVReg64);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ return;
}
- else if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
+
+ bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, SrcMO2, SrcMO1,
- SrcMO2, true);
+ I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, isVReg64);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ return;
}
+
+ bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, SrcMO2, SrcMO1,
+ SrcMO2, isVReg64);
return;
}