[llvm] Co-issue packed instructions by unpacking (PR #151704)
Akash Dutta via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 8 12:04:17 PDT 2025
https://github.com/akadutta updated https://github.com/llvm/llvm-project/pull/151704
>From 7c443285ec2df426ca1ff93f236fcb67d735338f Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 29 Jul 2025 15:40:34 -0500
Subject: [PATCH 01/16] initial commit
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 409 ++++++++++++++++++
1 file changed, 409 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..d76502d18f7e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -39,6 +39,21 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
+#include "AMDGPURegisterBankInfo.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/InitializePasses.h"
+#include <unordered_set>
+
+#include "GCNSchedStrategy.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
@@ -53,6 +68,17 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
+ bool unpackInsts(MachineFunction &MF);
+ bool createListOfPackedInstr(MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen);
+ bool isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const;
+ bool isUnpackingSupportedInstr(MachineInstr &MI) const;
+ void insertMI(MachineInstr &I);
+ SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
+ unsigned SGPRSrcPos);
+ SmallVector<MachineInstr *, 2>
+ insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
+ MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
+ bool isVreg_64);
public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -62,6 +88,7 @@ class GCNPreRAOptimizationsImpl {
class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
public:
static char ID;
+ const MachineLoopInfo *MLI = nullptr;
GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {
initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry());
@@ -75,6 +102,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervalsWrapperPass>();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -225,10 +253,390 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
+bool GCNPreRAOptimizationsImpl::isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const {
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ // bool IsGFX942Only = ST.hasGFX940Insts() && !ST.hasGFX950Insts();
+ // if (!IsGFX942Only)
+ // return false;
+
+ if (!SIInstrInfo::isVALU(MI)){
+ return false;
+ }
+
+
+ // V_COS, V_EXP, V_RCP, etc.
+ if (SIInstrInfo::isTRANS(MI))
+ return true;
+
+ // DOT2, DOT2C, DOT4, etc.
+ if (SIInstrInfo::isDOT(MI))
+ return true;
+
+ // MFMA, SMFMA
+ if (SIInstrInfo::isMFMA(MI))
+ return true;
+
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_CVT_PK_BF8_F32_e64:
+ case AMDGPU::V_CVT_PK_FP8_F32_e64:
+ case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+ case AMDGPU::V_MQSAD_U32_U8_e64:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_I16:
+ case AMDGPU::V_PK_ADD_U16:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_FMA_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMAC_F16_e32:
+ case AMDGPU::V_PK_FMAC_F16_e64:
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_MAD_I16:
+ case AMDGPU::V_PK_MAD_U16:
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_PK_MAX_I16:
+ case AMDGPU::V_PK_MAX_U16:
+ case AMDGPU::V_PK_MIN_F16:
+ case AMDGPU::V_PK_MIN_I16:
+ case AMDGPU::V_PK_MIN_U16:
+ case AMDGPU::V_PK_MOV_B32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_LO_U16:
+ case AMDGPU::V_PK_SUB_I16:
+ case AMDGPU::V_PK_SUB_U16:
+ case AMDGPU::V_QSAD_PK_U16_U8_e64:
+ return true;
+
+ default:
+ return false;
+
+ }
+}
+
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ return true;
+
+ default:
+ return false;
+
+ }
+}
+
+SmallVector<MachineInstr *, 2>
+GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
+ unsigned SGPRSrcPos) {
+ SmallVector<MachineInstr *, 2> MIList;
+
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
+ MachineInstr *CopySGPR1 =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY))
+ .addDef(TmpReg, RegState::Undef)
+ .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub0);
+ unsigned SubIdx = TRI->composeSubRegIndices(
+ AMDGPU::sub0, CopySGPR1->getOperand(0).getSubReg());
+ CopySGPR1->getOperand(0).setReg(CopySGPR1->getOperand(0).getReg());
+ CopySGPR1->getOperand(0).setSubReg(SubIdx);
+ LIS->InsertMachineInstrInMaps(*CopySGPR1);
+ MIList.push_back(CopySGPR1);
+
+ MachineInstr *CopySGPR2 =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY))
+ .addDef(TmpReg)
+ .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub1);
+ SubIdx = TRI->composeSubRegIndices(AMDGPU::sub1,
+ CopySGPR2->getOperand(0).getSubReg());
+ CopySGPR2->getOperand(0).setReg(CopySGPR2->getOperand(0).getReg());
+ CopySGPR2->getOperand(0).setSubReg(SubIdx);
+ LIS->InsertMachineInstrInMaps(*CopySGPR2);
+ MIList.push_back(CopySGPR2);
+ return MIList;
+}
+
+bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
+ MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen) {
+ auto *BB = BeginMI.getParent();
+ auto *MF = BB->getParent();
+ int NumInst = 0;
+
+ auto E = BB->end();
+ auto schedModel = TII->getSchedModel();
+ const MCSchedClassDesc *schedClassDesc = schedModel.resolveSchedClass(&BeginMI);
+ const int NumMFMACycles = schedModel.getWriteProcResBegin(schedClassDesc)->ReleaseAtCycle;
+ int totalCyclesBetweenCandidates = 0;
+ for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+ MachineInstr &Instr = *I;
+ const MCSchedClassDesc *instrSchedClassDesc = schedModel.resolveSchedClass(&Instr);
+ totalCyclesBetweenCandidates += schedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
+ if (Instr.isMetaInstruction())
+ continue;
+
+ if (Instr.isTerminator())
+ return false;
+
+ if (totalCyclesBetweenCandidates > NumMFMACycles)
+ return false;
+
+ if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F32) && isNeverCoissue(Instr, Instr.getParent()->getParent())) {
+ totalCyclesBetweenCandidates += 1;
+ seen.insert(&Instr);
+ }
+ }
+ return true;
+}
+
+SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
+ MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
+ MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64) {
+
+ SmallVector<MachineInstr *, 2> MIList;
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register DstReg = DstMO.getReg();
+
+ unsigned SrcSubIdx1 =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
+ unsigned SrcSubIdx2 =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
+ unsigned DestSubIdx =
+ TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
+
+ const MCInstrDesc instrDesc = I.getDesc();
+
+ int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int64_t clampVal = I.getOperand(clampIdx).getImm();
+
+ int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
+ unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
+
+ //don't worry about abs values. Packed instructions (VOP3P) do not support them
+ unsigned Lo_src0_mods = 0;
+ unsigned Lo_src1_mods = 0;
+
+ MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst
+ if (src0_Mods & SISrcMods::OP_SEL_0) {
+ if (src0_Mods & SISrcMods::NEG) {
+ Lo_src0_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0
+ }
+ else {
+ Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel == 0, select register 0 of reg:sub0_sub1
+ }
+
+ if (src1_Mods & SISrcMods::OP_SEL_0) {
+ if (src1_Mods & SISrcMods::NEG) {
+ Lo_src1_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src1
+ }
+ else {
+ Op0L_Op1L.addImm(Lo_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src1 //if op_sel == 0, select register 0 of reg:sub0_sub1
+ }
+ Op0L_Op1L.addImm(clampVal); //clamp
+ //packed instructions do not support output modifiers. safe to assign them 0 for this use case
+ Op0L_Op1L.addImm(0); //omod
+
+ if (isVreg_64) {
+ Op0L_Op1L->getOperand(0).setIsUndef();
+ }
+ else {
+ if (I.getOperand(0).isUndef()) {
+ Op0L_Op1L->getOperand(0).setIsUndef();
+ }
+ }
+
+ LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
+
+ SrcSubIdx1 =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ SrcSubIdx2 =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ DestSubIdx =
+ TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
+
+ //don't worry about abs values. Packed instructions (VOP3P) do not support them
+ unsigned Hi_src0_mods = 0;
+ unsigned Hi_src1_mods = 0;
+
+ MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst
+ if (src0_Mods & SISrcMods::OP_SEL_1) {
+ if (src0_Mods & SISrcMods::NEG_HI) {
+ Hi_src0_mods |= SISrcMods::NEG;
+ }
+ Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0
+ }
+ else {
+ Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ }
+
+ if (src1_Mods & SISrcMods::OP_SEL_1) {
+ if (src1_Mods & SISrcMods::NEG_HI) {
+ Hi_src1_mods |= SISrcMods::NEG;
+ }
+ Op0H_Op1H.addImm(Hi_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src1
+ }
+ else {
+ Op0H_Op1H.addImm(Hi_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0);
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src1 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ }
+ Op0H_Op1H.addImm(clampVal); //clamp
+ //packed instructions do not support output modifiers. safe to assign them 0 for this use case
+ Op0H_Op1H.addImm(0); //omod
+ LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
+
+ if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ }
+ LIS->RemoveMachineInstrFromMaps(I);
+ I.eraseFromParent();
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+ MIList.push_back(Op0L_Op1L);
+ MIList.push_back(Op0H_Op1H);
+ return MIList;
+}
+
+void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg1 = I.getOperand(2).getReg();
+ Register SrcReg2 = I.getOperand(4).getReg();
+
+ MachineOperand &DstMO = I.getOperand(0);
+ MachineOperand &SrcMO1 = I.getOperand(2);
+ MachineOperand &SrcMO2 = I.getOperand(4);
+
+ MachineBasicBlock::iterator MII = I;
+ const DebugLoc &DL = I.getDebugLoc();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg());
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg());
+ const TargetRegisterClass *Src0SubRC =
+ TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1);
+
+ if ((Src1RC->getID() == AMDGPU::SGPR_64RegClassID) ||
+ (Src0RC->getID() == AMDGPU::SGPR_64RegClassID)) {
+ if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ // try with sgpr32
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 4);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
+
+ if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), true);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ } else {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), false);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ }
+ }
+ else {
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
+
+ if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, true);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ } else {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, false);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ }
+ }
+ return;
+ }
+
+ if (DstRC->getID() == AMDGPU::VReg_512_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, SrcMO2, SrcMO1,
+ SrcMO2, false);
+ }
+ else if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, SrcMO2, SrcMO1,
+ SrcMO2, true);
+ }
+ return;
+}
+
+bool GCNPreRAOptimizationsImpl::unpackInsts(MachineFunction &MF) {
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
+ auto schedModel = TII->getSchedModel();
+ for (MachineBasicBlock &MBB : MF) {
+ std::unordered_set<MachineInstr *> seen;
+ for (MachineInstr &MI : MBB) {
+ if (SIInstrInfo::isMFMA(MI)){
+ createListOfPackedInstr(MI, seen);
+ }
+
+ }
+ if (!seen.empty()) {
+ for (MachineInstr *MI : seen)
+ insertMI(*MI);
+ }
+ }
+ return true;
+}
+
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+ MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
return GCNPreRAOptimizationsImpl(LIS).run(MF);
}
@@ -248,6 +656,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
bool Changed = false;
+ Changed = unpackInsts(MF);
for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!LIS->hasInterval(Reg))
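A minimal standalone sketch of the operand-selection rule implemented by insertUnpackedMI above, for reference only (not part of the patch): for each source of a packed instruction, OP_SEL_0 chooses the sub-register feeding the lower unpacked instruction and OP_SEL_1 chooses the one feeding the upper, while the packed NEG/NEG_HI bits are carried over as the VOP3 NEG modifier of the corresponding half. The enums and struct below are hypothetical stand-ins for the SISrcMods bits and AMDGPU sub-register indices, and the NEG handling mirrors the patch as written (the modifier is only propagated in the branch where the op_sel bit is set).

#include <cstdio>

namespace unpack_demo {
// Hypothetical stand-ins for the SISrcMods bits and sub-register indices used
// by the pass; the real values live in SIDefines.h and the AMDGPU register info.
enum SrcMods : unsigned { NEG = 1u << 0, NEG_HI = 1u << 1, OP_SEL_0 = 1u << 2, OP_SEL_1 = 1u << 3 };
enum SubReg : unsigned { sub0 = 0, sub1 = 1 };

struct UnpackedSrc {
  SubReg Sub;    // which 32-bit half of the 64-bit packed source to read
  unsigned Mods; // VOP3 source modifiers for the unpacked instruction
};

// Source selection for the "lo" unpacked instruction: OP_SEL_0 chooses the
// half, and the packed NEG bit becomes the VOP3 NEG modifier.
UnpackedSrc selectLo(unsigned PackedMods) {
  if (PackedMods & OP_SEL_0)
    return {sub1, (PackedMods & NEG) ? unsigned(NEG) : 0u};
  return {sub0, 0u};
}

// Source selection for the "hi" unpacked instruction: OP_SEL_1 chooses the
// half, and the packed NEG_HI bit becomes the VOP3 NEG modifier.
UnpackedSrc selectHi(unsigned PackedMods) {
  if (PackedMods & OP_SEL_1)
    return {sub1, (PackedMods & NEG_HI) ? unsigned(NEG) : 0u};
  return {sub0, 0u};
}
} // namespace unpack_demo

int main() {
  // e.g. a source with op_sel set and the lo half negated.
  unsigned Mods = unpack_demo::OP_SEL_0 | unpack_demo::NEG;
  unpack_demo::UnpackedSrc Lo = unpack_demo::selectLo(Mods);
  unpack_demo::UnpackedSrc Hi = unpack_demo::selectHi(Mods);
  std::printf("lo: sub%u mods=%u, hi: sub%u mods=%u\n",
              unsigned(Lo.Sub), Lo.Mods, unsigned(Hi.Sub), Hi.Mods);
  return 0;
}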
>From 4bff9657e7016452f6657f1c217e804fd354d3ae Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 29 Jul 2025 15:44:14 -0500
Subject: [PATCH 02/16] add test
---
...unpack-non-coissue-insts-post-scheduler.ll | 116 ++++++++++++++++++
1 file changed, 116 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
new file mode 100644
index 0000000000000..5c6d376c92e65
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
@@ -0,0 +1,116 @@
+; TODO: change variable names. Make test smaller if possible
+
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+ at global_smem = external addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.amdgcn.exp2.f32(float)
+
+; Function Attrs: nofree norecurse nounwind
+define amdgpu_kernel void @attn_fwd(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg readonly captures(none) %2, ptr addrspace(1) inreg writeonly captures(none) %3, ptr addrspace(1) inreg writeonly captures(none) %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %12, i32 inreg %13, i32 inreg %14, i32 inreg %15, i32 inreg %16, i32 inreg %17, i32 inreg %18, i32 inreg %19, i32 inreg %20, i32 inreg %21, i32 inreg %22, float inreg %23, i32 inreg %24, ptr addrspace(1) inreg readnone captures(none) %25, i32 inreg %26, ptr addrspace(1) inreg readnone captures(none) %27) local_unnamed_addr {
+ %29 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+
+ %96 = sext i32 %8 to i64
+ %97 = getelementptr half, ptr addrspace(1) %1, i64 %96
+
+ %115 = icmp slt i32 %29, 16384
+
+ %135 = icmp slt i32 %29, 1
+
+ %215 = getelementptr half, ptr addrspace(3) @global_smem, i32 %29
+ %216 = load <8 x half>, ptr addrspace(3) %215, align 16
+
+ %276 = shl nuw nsw i32 %29, 7
+
+ %396 = getelementptr half, ptr addrspace(1) %97, i64 1
+ %397 = sext i32 %13 to i64
+ %398 = getelementptr half, ptr addrspace(1) %97, i64 %397
+
+ %536 = fsub float 0xFFF0000000000000, 0.5
+ %537 = tail call float @llvm.amdgcn.exp2.f32(float %536)
+
+ %538 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %29
+ %539 = load <8 x half>, ptr addrspace(3) %538, align 16
+
+ %573 = icmp ult i32 1, 511
+ br i1 %573, label %575, label %574
+
+574: ; preds = %28
+ br label %575
+
+575: ; preds = %574, %28
+ %610 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+
+ br label %686
+
+686: ; preds = %575, %686
+ %.pn347561 = phi float [ %537, %575 ], [ %1329, %686 ]
+
+
+ %690 = phi i32 [ 0, %575 ], [ %1120, %686 ]
+ %691 = phi ptr addrspace(1) [ %398, %575 ], [ %1117, %686 ]
+ %692 = phi ptr addrspace(1) [ %396, %575 ], [ %1116, %686 ]
+
+ %695 = phi <2 x half> [ %610, %575 ], [ %1414, %686 ]
+
+
+ %759 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ]
+ %760 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ]
+
+ %tmp6 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ]
+ %tmp7 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ]
+
+ %871 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.s.setprio(i16 0)
+ %872 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> %871, i32 0, i32 0, i32 0)
+ %879 = extractelement <16 x float> %872, i64 0
+
+
+ %957 = insertelement <2 x float> poison, float %.pn347561, i64 0
+ %958 = shufflevector <2 x float> %957, <2 x float> poison, <2 x i32> zeroinitializer
+ %959 = fmul <2 x float> %759, %958
+ %960 = fmul <2 x float> %760, %958
+
+ %tmp1 = fmul <2 x float> %tmp6, %958
+ %tmp2 = fmul <2 x float> %tmp7, %958
+
+ %1048 = shufflevector <2 x half> %695, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+
+ %1116 = getelementptr half, ptr addrspace(1) %692, i64 1
+ %1117 = getelementptr half, ptr addrspace(1) %691, i64 %397
+
+ %1119 = icmp slt i32 %690, 2
+ %1120 = select i1 %1119, i32 %690, i32 0
+ %.idx359 = shl i32 %1120, 14
+ %1121 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx359
+
+ %1140 = shufflevector <8 x half> %1048, <8 x half> %1048, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+
+ %1157 = shufflevector <2 x float> %959, <2 x float> %960, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+
+ %1173 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %1157, i32 0, i32 0, i32 0)
+ %tmp4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %tmp3, i32 0, i32 0, i32 0)
+
+
+ %1329 = tail call float @llvm.amdgcn.exp2.f32(float %879)
+
+ %.idx367 = shl i32 %690, 14
+ %1404 = getelementptr i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx367
+
+ %1412 = add nuw nsw i32 0, 64
+ %1413 = icmp samesign ult i32 0, 7936
+ %1414 = shufflevector <8 x half> %1140, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+
+ %1478 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> <i32 0, i32 1>
+ %tmp5 = shufflevector <16 x float> %tmp4, <16 x float> poison, <2 x i32> <i32 0, i32 1>
+
+ br i1 %1413, label %686, label %1510
+
+1510: ; preds = %686
+ ret void
+}
>From d3b19c668d30e4dc906a301c13d2cf6a2e434c7a Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 29 Jul 2025 16:31:30 -0500
Subject: [PATCH 03/16] code cleanup
---
llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index d76502d18f7e7..e2c65bf25d31c 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -39,19 +39,12 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
-#include "AMDGPURegisterBankInfo.h"
#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
#include <unordered_set>
#include "GCNSchedStrategy.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
@@ -88,7 +81,6 @@ class GCNPreRAOptimizationsImpl {
class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
public:
static char ID;
- const MachineLoopInfo *MLI = nullptr;
GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {
initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry());
@@ -102,7 +94,6 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervalsWrapperPass>();
- AU.addRequired<MachineLoopInfoWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -636,7 +627,6 @@ bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
- MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
return GCNPreRAOptimizationsImpl(LIS).run(MF);
}
>From c581612e5cd376b5ee6ef19626444dec25e077d6 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 31 Jul 2025 20:02:28 -0500
Subject: [PATCH 04/16] miscellaneous code optimizations and cleanup
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 201 ++++++------------
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 59 ++++-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 1 +
3 files changed, 127 insertions(+), 134 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index e2c65bf25d31c..844fc1439099f 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -28,6 +28,12 @@
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
/// the VGPR_32, the COPY can be completely eliminated.
///
+/// Additionally, this pass unpacks packed instructions (V_PK_MUL_F32 and
+/// V_PK_ADD_F32) that are adjacent to MFMAs so that they can be co-issued.
+/// This helps overlap MFMAs with certain vector instructions in the machine
+/// schedule and is expected to improve performance. Only packed instructions
+/// whose execution fits within the latency of the surrounding MFMA are
+/// unpacked; the rest remain untouched.
//===----------------------------------------------------------------------===//
#include "GCNPreRAOptimizations.h"
@@ -38,12 +44,10 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
-
+#include "llvm/ADT/DenseSet.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
-#include <unordered_set>
-
#include "GCNSchedStrategy.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
@@ -61,11 +65,10 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
- bool unpackInsts(MachineFunction &MF);
- bool createListOfPackedInstr(MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen);
- bool isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const;
+ bool createListOfPackedInstr(MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack);
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
void insertMI(MachineInstr &I);
+ uint16_t mapToUnpackedOpcode(MachineInstr &I);
SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
unsigned SGPRSrcPos);
SmallVector<MachineInstr *, 2>
@@ -244,80 +247,28 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
-bool GCNPreRAOptimizationsImpl::isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const {
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- // bool IsGFX942Only = ST.hasGFX940Insts() && !ST.hasGFX950Insts();
- // if (!IsGFX942Only)
- // return false;
-
- if (!SIInstrInfo::isVALU(MI)){
- return false;
- }
-
-
- // V_COS, V_EXP, V_RCP, etc.
- if (SIInstrInfo::isTRANS(MI))
- return true;
-
- // DOT2, DOT2C, DOT4, etc.
- if (SIInstrInfo::isDOT(MI))
- return true;
-
- // MFMA, SMFMA
- if (SIInstrInfo::isMFMA(MI))
- return true;
-
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- case AMDGPU::V_CVT_PK_BF8_F32_e64:
- case AMDGPU::V_CVT_PK_FP8_F32_e64:
- case AMDGPU::V_MQSAD_PK_U16_U8_e64:
- case AMDGPU::V_MQSAD_U32_U8_e64:
- case AMDGPU::V_PK_ADD_F16:
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_ADD_I16:
- case AMDGPU::V_PK_ADD_U16:
- case AMDGPU::V_PK_ASHRREV_I16:
- case AMDGPU::V_PK_FMA_F16:
- case AMDGPU::V_PK_FMA_F32:
- case AMDGPU::V_PK_FMAC_F16_e32:
- case AMDGPU::V_PK_FMAC_F16_e64:
- case AMDGPU::V_PK_LSHLREV_B16:
- case AMDGPU::V_PK_LSHRREV_B16:
- case AMDGPU::V_PK_MAD_I16:
- case AMDGPU::V_PK_MAD_U16:
- case AMDGPU::V_PK_MAX_F16:
- case AMDGPU::V_PK_MAX_I16:
- case AMDGPU::V_PK_MAX_U16:
- case AMDGPU::V_PK_MIN_F16:
- case AMDGPU::V_PK_MIN_I16:
- case AMDGPU::V_PK_MIN_U16:
- case AMDGPU::V_PK_MOV_B32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_MUL_LO_U16:
- case AMDGPU::V_PK_SUB_I16:
- case AMDGPU::V_PK_SUB_U16:
- case AMDGPU::V_QSAD_PK_U16_U8_e64:
- return true;
-
- default:
- return false;
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F32:
+ return true;
+
+ default:
+ return false;
}
}
-bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
- unsigned Opcode = MI.getOpcode();
+uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
+ unsigned Opcode = I.getOpcode();
switch (Opcode) {
- case AMDGPU::V_PK_ADD_F16:
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_MUL_F32:
- return true;
-
- default:
- return false;
+ case AMDGPU::V_PK_ADD_F32:
+ return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::V_PK_MUL_F32:
+ return AMDGPU::V_MUL_F32_e64;
+ default:
+ return std::numeric_limits<uint16_t>::max();
}
}
@@ -358,7 +309,7 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
}
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
- MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen) {
+ MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack) {
auto *BB = BeginMI.getParent();
auto *MF = BB->getParent();
int NumInst = 0;
@@ -377,13 +328,13 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
if (Instr.isTerminator())
return false;
-
+
if (totalCyclesBetweenCandidates > NumMFMACycles)
return false;
- if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F32) && isNeverCoissue(Instr, Instr.getParent()->getParent())) {
+ if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
totalCyclesBetweenCandidates += 1;
- seen.insert(&Instr);
+ instrsToUnpack.insert(&Instr);
}
}
return true;
@@ -420,8 +371,8 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
//don't worry about abs values. Packed instructions (VOP3P) do not support them
unsigned Lo_src0_mods = 0;
unsigned Lo_src1_mods = 0;
-
- MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
+ MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst
if (src0_Mods & SISrcMods::OP_SEL_0) {
if (src0_Mods & SISrcMods::NEG) {
@@ -476,7 +427,7 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
unsigned Hi_src0_mods = 0;
unsigned Hi_src1_mods = 0;
- MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst
if (src0_Mods & SISrcMods::OP_SEL_1) {
if (src0_Mods & SISrcMods::NEG_HI) {
@@ -600,29 +551,6 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
return;
}
-bool GCNPreRAOptimizationsImpl::unpackInsts(MachineFunction &MF) {
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- TII = ST.getInstrInfo();
- TRI = &TII->getRegisterInfo();
-
- auto schedModel = TII->getSchedModel();
- for (MachineBasicBlock &MBB : MF) {
- std::unordered_set<MachineInstr *> seen;
- for (MachineInstr &MI : MBB) {
- if (SIInstrInfo::isMFMA(MI)){
- createListOfPackedInstr(MI, seen);
- }
-
- }
- if (!seen.empty()) {
- for (MachineInstr *MI : seen)
- insertMI(*MI);
- }
- }
- return true;
-}
-
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -646,7 +574,6 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
bool Changed = false;
- Changed = unpackInsts(MF);
for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!LIS->hasInterval(Reg))
@@ -659,38 +586,46 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
Changed |= processReg(Reg);
}
- if (!ST.useRealTrue16Insts())
- return Changed;
-
// Add RA hints to improve True16 COPY elimination.
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- if (MI.getOpcode() != AMDGPU::COPY)
- continue;
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- if (Dst.isVirtual() &&
- MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
- Src.isPhysical() &&
- TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
- if (Src.isVirtual() &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
- Dst.isPhysical() &&
- TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
- if (!Dst.isVirtual() || !Src.isVirtual())
- continue;
- if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
- MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
- MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ // Unpack packed instructions so they overlap with MFMAs; this lets the compiler co-issue the unpacked instructions with the MFMA.
+ for (MachineBasicBlock &MBB : MF) {
+ DenseSet<MachineInstr *> instrsToUnpack;
+ for (MachineInstr &MI : MBB) {
+ if (SIInstrInfo::isMFMA(MI)){
+ createListOfPackedInstr(MI, instrsToUnpack);
+ }
+ if (ST.useRealTrue16Insts()){
+ if (MI.getOpcode() != AMDGPU::COPY)
+ continue;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (Dst.isVirtual() &&
+ MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ Src.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+ if (Src.isVirtual() &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+ Dst.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ continue;
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+ MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ }
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
}
- if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+ }
+
+ if (!instrsToUnpack.empty()) {
+ for (MachineInstr *MI : instrsToUnpack)
+ insertMI(*MI);
}
}
-
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c2da937552240..5562ff590b71d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -15,7 +15,6 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
-#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -6173,6 +6172,64 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return isImmOperandLegal(MI, OpIdx, *MO);
}
+bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
+ bool IsGFX950Only = ST.hasGFX950Insts();
+ if (!IsGFX950Only)
+ return false;
+
+ if (!isVALU(MI))
+ return false;
+
+ // V_COS, V_EXP, V_RCP, etc.
+ if (isTRANS(MI))
+ return true;
+
+ // DOT2, DOT2C, DOT4, etc.
+ if (isDOT(MI))
+ return true;
+
+ // MFMA, SMFMA
+ if (isMFMA(MI))
+ return true;
+
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_CVT_PK_BF8_F32_e64:
+ case AMDGPU::V_CVT_PK_FP8_F32_e64:
+ case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+ case AMDGPU::V_MQSAD_U32_U8_e64:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_I16:
+ case AMDGPU::V_PK_ADD_U16:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_FMA_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMAC_F16_e32:
+ case AMDGPU::V_PK_FMAC_F16_e64:
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_MAD_I16:
+ case AMDGPU::V_PK_MAD_U16:
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_PK_MAX_I16:
+ case AMDGPU::V_PK_MAX_U16:
+ case AMDGPU::V_PK_MIN_F16:
+ case AMDGPU::V_PK_MIN_I16:
+ case AMDGPU::V_PK_MIN_U16:
+ case AMDGPU::V_PK_MOV_B32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_LO_U16:
+ case AMDGPU::V_PK_SUB_I16:
+ case AMDGPU::V_PK_SUB_U16:
+ case AMDGPU::V_QSAD_PK_U16_U8_e64:
+ return true;
+ default:
+ return false;
+ }
+}
+
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e042b59eb0f04..b7a0388470279 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1178,6 +1178,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const;
+ bool isNeverCoissue(MachineInstr &MI) const;
/// Return true if this 64-bit VALU instruction has a 32-bit encoding.
/// This function will return false if you pass it a 32-bit instruction.
bool hasVALU32BitEncoding(unsigned Opcode) const;
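A minimal standalone sketch of the cycle-budget scan that createListOfPackedInstr performs in this patch, under the assumption that each instruction's cost is its sched-model ReleaseAtCycle: the MFMA's latency is the budget, every following instruction consumes part of it, and a supported packed instruction is only recorded for unpacking while the running total still fits in that window. The Instr struct below is a hypothetical stand-in for MachineInstr plus the scheduling-model queries used in the real code.

#include <unordered_set>
#include <vector>

namespace budget_demo {
// Hypothetical stand-in for a MachineInstr together with the sched-model data
// the pass queries (ReleaseAtCycle of its write resource).
struct Instr {
  int ReleaseAtCycle = 1;
  bool IsTerminator = false;
  bool IsSupportedPacked = false; // e.g. a never-co-issued V_PK_MUL_F32/V_PK_ADD_F32
};

// Walk the instructions that follow an MFMA whose latency is MFMACycles and
// collect the packed instructions whose unpacked forms can hide under it.
void collectUnpackCandidates(const std::vector<Instr> &AfterMFMA, int MFMACycles,
                             std::unordered_set<const Instr *> &ToUnpack) {
  int Total = 0;
  for (const Instr &I : AfterMFMA) {
    Total += I.ReleaseAtCycle;            // every instruction consumes budget
    if (I.IsTerminator || Total > MFMACycles)
      return;                             // past the MFMA's latency shadow: stop
    if (I.IsSupportedPacked) {
      Total += 1;                         // unpacking adds one extra instruction
      ToUnpack.insert(&I);
    }
  }
}
} // namespace budget_demo

int main() {
  std::vector<budget_demo::Instr> Body(4);
  Body[1].IsSupportedPacked = true; // one packed candidate shortly after the MFMA
  std::unordered_set<const budget_demo::Instr *> ToUnpack;
  budget_demo::collectUnpackCandidates(Body, /*MFMACycles=*/16, ToUnpack);
  return ToUnpack.size() == 1 ? 0 : 1;
}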
>From c695b99ddae061127e015daf523b8eeec7888b71 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 1 Aug 2025 09:14:29 -0500
Subject: [PATCH 05/16] add code comments
---
llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 844fc1439099f..0f7009a6ea394 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -262,6 +262,9 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) cons
uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
unsigned Opcode = I.getOpcode();
+ // Use the 64-bit (VOP3) encoding for the unpacked instructions:
+ // VOP3 can carry over the VOP3P source modifiers of the packed instruction,
+ // whereas the e32 (VOP2) forms do not accept source modifiers.
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
return AMDGPU::V_ADD_F32_e64;
>From 1a51a42d4c633cd1a1a84878b2a3dce6764473b4 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Wed, 6 Aug 2025 16:24:08 -0500
Subject: [PATCH 06/16] removing repetitive code, capitalize vars
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 95 +++++++------------
1 file changed, 36 insertions(+), 59 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 0f7009a6ea394..f56d73e990269 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -318,25 +318,25 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
int NumInst = 0;
auto E = BB->end();
- auto schedModel = TII->getSchedModel();
- const MCSchedClassDesc *schedClassDesc = schedModel.resolveSchedClass(&BeginMI);
- const int NumMFMACycles = schedModel.getWriteProcResBegin(schedClassDesc)->ReleaseAtCycle;
- int totalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
+ const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&BeginMI);
+ const int NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ int TotalCyclesBetweenCandidates = 0;
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
- const MCSchedClassDesc *instrSchedClassDesc = schedModel.resolveSchedClass(&Instr);
- totalCyclesBetweenCandidates += schedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
+ const MCSchedClassDesc *instrSchedClassDesc = SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates += SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
if (Instr.isMetaInstruction())
continue;
if (Instr.isTerminator())
return false;
- if (totalCyclesBetweenCandidates > NumMFMACycles)
+ if (TotalCyclesBetweenCandidates > NumMFMACycles)
return false;
if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
- totalCyclesBetweenCandidates += 1;
+ TotalCyclesBetweenCandidates += 1;
instrsToUnpack.insert(&Instr);
}
}
@@ -411,10 +411,8 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
if (isVreg_64) {
Op0L_Op1L->getOperand(0).setIsUndef();
}
- else {
- if (I.getOperand(0).isUndef()) {
- Op0L_Op1L->getOperand(0).setIsUndef();
- }
+ else if (I.getOperand(0).isUndef()){
+ Op0L_Op1L->getOperand(0).setIsUndef();
}
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
@@ -499,58 +497,37 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1);
- if ((Src1RC->getID() == AMDGPU::SGPR_64RegClassID) ||
- (Src0RC->getID() == AMDGPU::SGPR_64RegClassID)) {
- if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) {
- // try with sgpr32
- SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 4);
- MachineInstr *CopySGPR1 = copyInstrs[0];
- MachineInstr *CopySGPR2 = copyInstrs[1];
-
- if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
- CopySGPR2->getOperand(0), true);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
- } else {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
- CopySGPR2->getOperand(0), false);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
- }
- }
- else {
- SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
- MachineInstr *CopySGPR1 = copyInstrs[0];
- MachineInstr *CopySGPR2 = copyInstrs[1];
-
- if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, true);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
- } else {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, false);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
- }
- }
- return;
- }
+ if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ // try with sgpr32
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 4);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
- if (DstRC->getID() == AMDGPU::VReg_512_Align2RegClassID) {
+ bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, SrcMO2, SrcMO1,
- SrcMO2, false);
+ I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), isVReg64);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ return;
}
- else if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
+
+ bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, SrcMO2, SrcMO1,
- SrcMO2, true);
+ I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, isVReg64);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ return;
}
+
+ bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, SrcMO2, SrcMO1,
+ SrcMO2, isVReg64);
return;
}
>From e9056e866ab3dd91e145430e83b9603f76d8b486 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 15 Aug 2025 18:00:36 -0500
Subject: [PATCH 07/16] adding support for FP16 ops
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 316 +++++++++++++++++-
...unpack-non-coissue-insts-post-scheduler.ll | 116 -------
2 files changed, 302 insertions(+), 130 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index f56d73e990269..33e07c5a16d97 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -44,13 +44,14 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
-#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
#include "GCNSchedStrategy.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
@@ -65,7 +66,7 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
- bool createListOfPackedInstr(MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack);
+ bool createListOfPackedInstr(MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack, uint16_t NumMFMACycles);
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
void insertMI(MachineInstr &I);
uint16_t mapToUnpackedOpcode(MachineInstr &I);
@@ -75,6 +76,10 @@ class GCNPreRAOptimizationsImpl {
insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
bool isVreg_64);
+ void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget);
+ bool IsF16MaskSet = false;
+ Register MaskLo; //mask used to extract the lower 16 bits of F16 packed operands
+ Register ShiftAmt; //shift amount (16) used to extract the upper 16 bits of F16 packed operands
public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -252,6 +257,8 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) cons
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_ADD_F16:
return true;
default:
@@ -270,6 +277,10 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
return AMDGPU::V_ADD_F32_e64;
case AMDGPU::V_PK_MUL_F32:
return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::V_PK_ADD_F16:
+ return AMDGPU::V_ADD_F16_e64;
+ case AMDGPU::V_PK_MUL_F16:
+ return AMDGPU::V_MUL_F16_e64;
default:
return std::numeric_limits<uint16_t>::max();
@@ -312,16 +323,15 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
}
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
- MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack) {
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack, uint16_t NumMFMACycles) {
auto *BB = BeginMI.getParent();
auto *MF = BB->getParent();
int NumInst = 0;
auto E = BB->end();
- auto SchedModel = TII->getSchedModel();
- const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&BeginMI);
- const int NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+
int TotalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
const MCSchedClassDesc *instrSchedClassDesc = SchedModel.resolveSchedClass(&Instr);
@@ -334,10 +344,41 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
if (TotalCyclesBetweenCandidates > NumMFMACycles)
return false;
-
+
if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
- TotalCyclesBetweenCandidates += 1;
- instrsToUnpack.insert(&Instr);
+ if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) || (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)){
+ // Unpacking a packed F16 instruction requires several instructions: first to
+ // extract the lower and upper 16 bits of each operand, then the two unpacked
+ // F16 operations, and finally instructions to recombine the two results into
+ // the original destination register. The following sequence is issued.
+
+ // The first two moves materialize the masks in VGPRs. Ideally immediates would
+ // be used directly, but if one of the source operands is an SGPR, immediates
+ // are not allowed, so the masks have to be moved into VGPRs.
+
+ // vgpr_32 = V_MOV_B32_e32 65535
+ // vgpr_32 = V_MOV_B32_e32 16
+
+ // vgpr_32 = V_AND_B32_e32 sub1:sreg_64, vgpr_32
+ // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, sub1:sreg_64
+ // vgpr_32 = V_AND_B32_e32 vgpr_32, vgpr_32
+ // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, vgpr_32
+ // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
+ // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
+ // vgpr_32 = V_LSHLREV_B32_e64 vgpr_32, vgpr_32
+ // dst_reg = V_OR_B32_e64 vgpr_32, vgpr_32
+
+ // The MOV instructions need to be issued only once; after they are, the
+ // IsF16MaskSet flag is set and subsequent unpacking issues only the remaining
+ // instructions. Each instruction above has a latency of one cycle, which is
+ // hard-coded here to keep the code simple.
+ if (IsF16MaskSet)
+ TotalCyclesBetweenCandidates += 7;
+ else
+ TotalCyclesBetweenCandidates += 9;
+ }
+ else
+ TotalCyclesBetweenCandidates += 1;
+
+ if (!(TotalCyclesBetweenCandidates > NumMFMACycles))
+ instrsToUnpack.insert(&Instr);
}
}
return true;
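A standalone sketch of the arithmetic that the generated V_AND_B32 / V_LSHRREV_B32 / V_*_F16 / V_LSHLREV_B32 / V_OR_B32 sequence described above computes when a packed F16 instruction is unpacked, assuming a placeholder 16-bit operation in place of the real F16 multiply/add: op_sel and op_sel_hi pick which half of each 32-bit source feeds the lo and hi scalar ops, and the two 16-bit results are recombined as lo | (hi << 16). Only the packing/unpacking data flow is being shown.

#include <cstdint>
#include <functional>

namespace f16_demo {
// Select one 16-bit half of a packed 32-bit register (Upper = true picks the
// high half), mirroring the V_LSHRREV_B32-by-16 vs V_AND_B32-with-0xFFFF choice.
inline uint32_t half(uint32_t Packed, bool Upper) {
  return Upper ? (Packed >> 16) : (Packed & 0xFFFFu);
}

// Unpack both sources according to their op_sel/op_sel_hi bits, apply Op to
// each pair of halves, and recombine as Lo | (Hi << 16), which is what the
// trailing V_LSHLREV_B32 + V_OR_B32 pair reassembles into the destination.
uint32_t unpackApplyRepack(uint32_t Src0, uint32_t Src1, bool Src0OpSel,
                           bool Src0OpSelHi, bool Src1OpSel, bool Src1OpSelHi,
                           const std::function<uint32_t(uint32_t, uint32_t)> &Op) {
  uint32_t Lo = Op(half(Src0, Src0OpSel), half(Src1, Src1OpSel)) & 0xFFFFu;
  uint32_t Hi = Op(half(Src0, Src0OpSelHi), half(Src1, Src1OpSelHi)) & 0xFFFFu;
  return Lo | (Hi << 16);
}
} // namespace f16_demo

int main() {
  // Placeholder 16-bit op standing in for the unpacked V_ADD_F16/V_MUL_F16.
  auto AddU16 = [](uint32_t A, uint32_t B) { return (A + B) & 0xFFFFu; };
  // Src0 packs (1, 2), Src1 packs (3, 4); op_sel_hi selects the upper halves.
  uint32_t R = f16_demo::unpackApplyRepack(0x00020001u, 0x00040003u,
                                           /*Src0OpSel=*/false, /*Src0OpSelHi=*/true,
                                           /*Src1OpSel=*/false, /*Src1OpSelHi=*/true, AddU16);
  return R == 0x00060004u ? 0 : 1;
}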
@@ -531,6 +572,242 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
return;
}
+void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget) {
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineOperand &DstMO = I.getOperand(0);
+ MachineOperand &SrcMO0 = I.getOperand(2);
+ MachineOperand &SrcMO1 = I.getOperand(4);
+
+ Register DstReg = DstMO.getReg();
+ Register SrcReg0 = SrcMO0.getReg();
+ Register SrcReg1 = SrcMO1.getReg();
+
+ const DebugLoc &DL = I.getDebugLoc();
+
+ const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
+ auto SchedModel = TII->getSchedModel();
+
+ uint16_t AddlCyclesConsumed = 0;
+ SetVector<MachineInstr *> ListOfNewInstructions;
+
+ auto BuildImm = [&](uint32_t Val) -> std::pair<Register, uint16_t> {
+ Register ImmReg = MRI.createVirtualRegister(RC);
+ auto newMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ .addImm(Val);
+ LIS->InsertMachineInstrInMaps(*newMI);
+ const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(newMI);
+ uint16_t LatencyCycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ return {ImmReg, LatencyCycles};
+ };
+
+
+ if (!IsF16MaskSet) {
+ std::pair<Register, uint16_t> RegAndLatency = BuildImm(0x0000FFFF);
+ MaskLo = RegAndLatency.first; //mask for lower 16 bits
+ AddlCyclesConsumed += RegAndLatency.second;
+ RegAndLatency = BuildImm(16);
+ ShiftAmt = RegAndLatency.first; //shift amount used to extract the upper 16 bits
+ AddlCyclesConsumed += RegAndLatency.second;
+ IsF16MaskSet = true;
+ }
+
+ Register Src0_Lo = MRI.createVirtualRegister(RC);
+ Register Src1_Lo = MRI.createVirtualRegister(RC);
+ Register Src0_Hi = MRI.createVirtualRegister(RC);
+ Register Src1_Hi = MRI.createVirtualRegister(RC);
+ Register Input0 = MRI.createVirtualRegister(RC);
+ Register Input1 = MRI.createVirtualRegister(RC);
+
+ unsigned SubRegID = 0;
+ if (SrcMO0.getSubReg())
+ SubRegID = SrcMO0.getSubReg();
+
+ int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
+ unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
+ int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int64_t clampVal = I.getOperand(clampIdx).getImm();
+
+ // handle op_sel for src0
+ if (src0_Mods & SISrcMods::OP_SEL_0) {
+ // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo)
+ .addReg(ShiftAmt);
+ if (SubRegID)
+ LoInput0_MI.addReg(SrcReg0, 0, SubRegID);
+ else
+ LoInput0_MI.addReg(SrcReg0);
+ LIS->InsertMachineInstrInMaps(*LoInput0_MI);
+ }
+ else {
+ // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo);
+ if (SubRegID)
+ LoInput0_MI.addReg(SrcReg0, 0, SubRegID);
+ else
+ LoInput0_MI.addReg(SrcReg0);
+ LoInput0_MI.addReg(MaskLo);
+ LIS->InsertMachineInstrInMaps(*LoInput0_MI);
+ }
+
+ // handle op_sel_hi for src0
+ if (src0_Mods & SISrcMods::OP_SEL_1) {
+ // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi)
+ .addReg(ShiftAmt);
+ if (SubRegID)
+ HiInput0_MI.addReg(SrcReg0, 0, SubRegID);
+ else
+ HiInput0_MI.addReg(SrcReg0);
+ LIS->InsertMachineInstrInMaps(*HiInput0_MI);
+ }
+ else {
+ // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi);
+ if (SubRegID)
+ HiInput0_MI.addReg(SrcReg0, 0, SubRegID);
+ else
+ HiInput0_MI.addReg(SrcReg0);
+ HiInput0_MI.addReg(MaskLo);
+ LIS->InsertMachineInstrInMaps(*HiInput0_MI);
+ }
+
+ SubRegID = 0;
+ if (SrcMO1.getSubReg())
+ SubRegID = SrcMO1.getSubReg();
+ // handle op_sel for src1
+ if (src1_Mods & SISrcMods::OP_SEL_0) {
+ // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo)
+ .addReg(ShiftAmt);
+ if (SubRegID)
+ LoInput1_MI.addReg(SrcReg1, 0, SubRegID);
+ else
+ LoInput1_MI.addReg(SrcReg1);
+ LIS->InsertMachineInstrInMaps(*LoInput1_MI);
+ }
+ else {
+ // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo);
+ if (SubRegID)
+ LoInput1_MI.addReg(SrcReg1, 0, SubRegID);
+ else
+ LoInput1_MI.addReg(SrcReg1);
+ LoInput1_MI.addReg(MaskLo);
+ LIS->InsertMachineInstrInMaps(*LoInput1_MI);
+ }
+
+ // handle op_sel_hi for src1
+ if (src1_Mods & SISrcMods::OP_SEL_1) {
+ // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi)
+ .addReg(ShiftAmt);
+ if (SubRegID)
+ HiInput1_MI.addReg(SrcReg1, 0, SubRegID);
+ else
+ HiInput1_MI.addReg(SrcReg1);
+ LIS->InsertMachineInstrInMaps(*HiInput1_MI);
+ }
+ else {
+ // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi);
+ if (SubRegID)
+ HiInput1_MI.addReg(SrcReg1, 0, SubRegID);
+ else
+ HiInput1_MI.addReg(SrcReg1);
+ HiInput1_MI.addReg(MaskLo);
+ LIS->InsertMachineInstrInMaps(*HiInput1_MI);
+ }
+
+ Register LoMul = MRI.createVirtualRegister(RC);
+ Register HiMul = MRI.createVirtualRegister(RC);
+
+ unsigned Lo_src0_mods = 0;
+ unsigned Lo_src1_mods = 0;
+ uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
+
+ // Unpacked instructions
+ MachineInstrBuilder LoMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul);
+
+ if (src0_Mods & SISrcMods::NEG)
+ Lo_src0_mods |= SISrcMods::NEG;
+
+ LoMul_MI.addImm(Lo_src0_mods); //src0_modifiers
+ LoMul_MI.addReg(Src0_Lo, RegState::Kill); //src0
+
+ if (src1_Mods & SISrcMods::NEG)
+ Lo_src1_mods |= SISrcMods::NEG;
+
+ LoMul_MI.addImm(Lo_src1_mods); //src1_modifiers
+ LoMul_MI.addReg(Src1_Lo, RegState::Kill); //src1
+ LoMul_MI.addImm(clampVal); //clamp
+ //packed instructions do not support output modifiers. safe to assign them 0 for this use case
+ LoMul_MI.addImm(0); //omod
+
+ // unpacked instruction with VOP3 encoding for Hi bits
+ unsigned Hi_src0_mods = 0;
+ unsigned Hi_src1_mods = 0;
+
+ MachineInstrBuilder HiMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul);
+ if (src0_Mods & SISrcMods::NEG_HI)
+ Hi_src0_mods |= SISrcMods::NEG_HI;
+
+ HiMul_MI.addImm(Hi_src0_mods); //src0_modifiers
+ HiMul_MI.addReg(Src0_Hi, RegState::Kill); //select higher 16 bits if op_sel_hi is set
+
+ if (src1_Mods & SISrcMods::NEG_HI)
+ Hi_src1_mods |= SISrcMods::NEG_HI;
+
+ HiMul_MI.addImm(Hi_src1_mods); //src1_modifiers
+ HiMul_MI.addReg(Src1_Hi, RegState::Kill); //select higher 16 bits from src1 if op_sel_hi is set
+ HiMul_MI.addImm(clampVal); //clamp
+ //packed instructions do not support output modifiers. safe to assign them 0 for this use case
+ HiMul_MI.addImm(0); //omod
+
+ // Shift HiMul left by 16
+ Register HiMulShifted = MRI.createVirtualRegister(RC);
+ MachineInstrBuilder HiMulShifted_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted)
+ .addReg(ShiftAmt)
+ .addReg(HiMul);
+
+ SubRegID = 0;
+ if (DstMO.getSubReg())
+ SubRegID = DstMO.getSubReg();
+ // OR LoMul | (HiMul << 16)
+ MachineInstrBuilder RewriteBackToDst_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64));
+ if (SubRegID) {
+ if (DstMO.isUndef()){
+ RewriteBackToDst_MI.addDef(DstReg, RegState::Undef, SubRegID);
+ }
+ else {
+ RewriteBackToDst_MI.addDef(DstReg, 0, SubRegID);
+ }
+ }
+ else {
+ if (DstMO.isUndef()){
+ RewriteBackToDst_MI.addDef(DstReg, RegState::Undef);
+ }
+ else {
+ RewriteBackToDst_MI.addDef(DstReg);
+ }
+ }
+ RewriteBackToDst_MI.addReg(LoMul);
+ RewriteBackToDst_MI.addReg(HiMulShifted);
+
+ LIS->InsertMachineInstrInMaps(*LoMul_MI);
+ LIS->InsertMachineInstrInMaps(*HiMul_MI);
+ LIS->InsertMachineInstrInMaps(*HiMulShifted_MI);
+ LIS->InsertMachineInstrInMaps(*RewriteBackToDst_MI);
+ LIS->RemoveMachineInstrFromMaps(I);
+ I.eraseFromParent();
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+
+}
+
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -569,10 +846,15 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
// Add RA hints to improve True16 COPY elimination.
// Unpack packed instructions to overlap MFMAs. This allows the compiler to co-issue unpacked instructions with MFMA
for (MachineBasicBlock &MBB : MF) {
- DenseSet<MachineInstr *> instrsToUnpack;
+ SetVector<MachineInstr *> instrsToUnpack;
+ IsF16MaskSet = false;
+ uint16_t NumMFMACycles = 0;
+ auto SchedModel = TII->getSchedModel();
for (MachineInstr &MI : MBB) {
if (SIInstrInfo::isMFMA(MI)){
- createListOfPackedInstr(MI, instrsToUnpack);
+ const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&MI);
+ NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ // createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles);
}
if (ST.useRealTrue16Insts()){
if (MI.getOpcode() != AMDGPU::COPY)
@@ -603,9 +885,15 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
}
if (!instrsToUnpack.empty()) {
- for (MachineInstr *MI : instrsToUnpack)
- insertMI(*MI);
+ for (MachineInstr *MI : instrsToUnpack) {
+ if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) || (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) {
+ processF16Unpacking(*MI, NumMFMACycles);
+ }
+ else {
+ insertMI(*MI);
+ }
+ }
}
}
return Changed;
-}
+}
\ No newline at end of file
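For reference, the sequence emitted by processF16Unpacking above amounts to the following bit manipulation on each packed 32-bit register: mask out the low 16-bit halves, shift down the high halves, run the unpacked F16 operation on each pair, then shift and OR the two results back into the destination. The C++ below is a minimal host-side sketch of that arithmetic, not part of the patch; unpackedPkOp and the Op callback are illustrative stand-ins for the emitted V_MUL_F16_e64/V_ADD_F16_e64.

// Host-side sketch only: mirrors the V_AND/V_LSHRREV/V_LSHLREV/V_OR shape the
// pass emits around the two unpacked F16 operations.
#include <cstdint>
#include <functional>

uint32_t unpackedPkOp(uint32_t Src0, uint32_t Src1,
                      const std::function<uint16_t(uint16_t, uint16_t)> &Op) {
  const uint32_t MaskLo = 0x0000FFFFu; // materialized by V_MOV_B32_e32 65535
  const uint32_t Shift = 16;           // materialized by V_MOV_B32_e32 16

  uint16_t Src0Lo = Src0 & MaskLo;     // V_AND_B32_e32
  uint16_t Src0Hi = Src0 >> Shift;     // V_LSHRREV_B32_e64
  uint16_t Src1Lo = Src1 & MaskLo;
  uint16_t Src1Hi = Src1 >> Shift;

  uint32_t LoRes = Op(Src0Lo, Src1Lo); // unpacked F16 op for the low lane
  uint32_t HiRes = Op(Src0Hi, Src1Hi); // unpacked F16 op for the high lane

  return LoRes | (HiRes << Shift);     // V_LSHLREV_B32_e64 + V_OR_B32_e64
}

The op_sel and op_sel_hi modifiers only change which half feeds each lane before the arithmetic runs; the recombination step is the same in every case.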
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
deleted file mode 100644
index 5c6d376c92e65..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
+++ /dev/null
@@ -1,116 +0,0 @@
-; TODO: change variable names. Make test smaller if possible
-
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-amd-amdhsa"
-
- at global_smem = external addrspace(3) global [0 x i8], align 16
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare float @llvm.amdgcn.exp2.f32(float)
-
-; Function Attrs: nofree norecurse nounwind
-define amdgpu_kernel void @attn_fwd(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg readonly captures(none) %2, ptr addrspace(1) inreg writeonly captures(none) %3, ptr addrspace(1) inreg writeonly captures(none) %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %12, i32 inreg %13, i32 inreg %14, i32 inreg %15, i32 inreg %16, i32 inreg %17, i32 inreg %18, i32 inreg %19, i32 inreg %20, i32 inreg %21, i32 inreg %22, float inreg %23, i32 inreg %24, ptr addrspace(1) inreg readnone captures(none) %25, i32 inreg %26, ptr addrspace(1) inreg readnone captures(none) %27) local_unnamed_addr {
- %29 = tail call i32 @llvm.amdgcn.workgroup.id.x()
-
- %96 = sext i32 %8 to i64
- %97 = getelementptr half, ptr addrspace(1) %1, i64 %96
-
- %115 = icmp slt i32 %29, 16384
-
- %135 = icmp slt i32 %29, 1
-
- %215 = getelementptr half, ptr addrspace(3) @global_smem, i32 %29
- %216 = load <8 x half>, ptr addrspace(3) %215, align 16
-
- %276 = shl nuw nsw i32 %29, 7
-
- %396 = getelementptr half, ptr addrspace(1) %97, i64 1
- %397 = sext i32 %13 to i64
- %398 = getelementptr half, ptr addrspace(1) %97, i64 %397
-
- %536 = fsub float 0xFFF0000000000000, 0.5
- %537 = tail call float @llvm.amdgcn.exp2.f32(float %536)
-
- %538 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %29
- %539 = load <8 x half>, ptr addrspace(3) %538, align 16
-
- %573 = icmp ult i32 1, 511
- br i1 %573, label %575, label %574
-
-574: ; preds = %28
- br label %575
-
-575: ; preds = %574, %28
- %610 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> <i32 0, i32 1>
-
- br label %686
-
-686: ; preds = %575, %686
- %.pn347561 = phi float [ %537, %575 ], [ %1329, %686 ]
-
-
- %690 = phi i32 [ 0, %575 ], [ %1120, %686 ]
- %691 = phi ptr addrspace(1) [ %398, %575 ], [ %1117, %686 ]
- %692 = phi ptr addrspace(1) [ %396, %575 ], [ %1116, %686 ]
-
- %695 = phi <2 x half> [ %610, %575 ], [ %1414, %686 ]
-
-
- %759 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ]
- %760 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ]
-
- %tmp6 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ]
- %tmp7 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ]
-
- %871 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
- tail call void @llvm.amdgcn.s.setprio(i16 0)
- %872 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> %871, i32 0, i32 0, i32 0)
- %879 = extractelement <16 x float> %872, i64 0
-
-
- %957 = insertelement <2 x float> poison, float %.pn347561, i64 0
- %958 = shufflevector <2 x float> %957, <2 x float> poison, <2 x i32> zeroinitializer
- %959 = fmul <2 x float> %759, %958
- %960 = fmul <2 x float> %760, %958
-
- %tmp1 = fmul <2 x float> %tmp6, %958
- %tmp2 = fmul <2 x float> %tmp7, %958
-
- %1048 = shufflevector <2 x half> %695, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-
- %1116 = getelementptr half, ptr addrspace(1) %692, i64 1
- %1117 = getelementptr half, ptr addrspace(1) %691, i64 %397
-
- %1119 = icmp slt i32 %690, 2
- %1120 = select i1 %1119, i32 %690, i32 0
- %.idx359 = shl i32 %1120, 14
- %1121 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx359
-
- %1140 = shufflevector <8 x half> %1048, <8 x half> %1048, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-
- %1157 = shufflevector <2 x float> %959, <2 x float> %960, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
- %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-
- %1173 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %1157, i32 0, i32 0, i32 0)
- %tmp4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %tmp3, i32 0, i32 0, i32 0)
-
-
- %1329 = tail call float @llvm.amdgcn.exp2.f32(float %879)
-
- %.idx367 = shl i32 %690, 14
- %1404 = getelementptr i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx367
-
- %1412 = add nuw nsw i32 0, 64
- %1413 = icmp samesign ult i32 0, 7936
- %1414 = shufflevector <8 x half> %1140, <8 x half> poison, <2 x i32> <i32 0, i32 1>
-
- %1478 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> <i32 0, i32 1>
- %tmp5 = shufflevector <16 x float> %tmp4, <16 x float> poison, <2 x i32> <i32 0, i32 1>
-
- br i1 %1413, label %686, label %1510
-
-1510: ; preds = %686
- ret void
-}
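The removed IR test above exercised the packed-F32 case: a V_PK_MUL_F32 adjacent to an MFMA is split into two V_MUL_F32_e64 that write the sub0/sub1 halves of the 64-bit destination, with each source's op_sel/op_sel_hi bit picking which sub-register feeds the low and high result lane. Below is a small sketch of that selection, assuming standard VOP3P semantics; SrcSel and unpackPkMulF32 are illustrative names, not helpers from the pass.

#include <utility>

// Per-source VOP3P selection bits (the encoded SISrcMods values are not
// assumed here).
struct SrcSel {
  bool OpSel;   // which sub-register of this source feeds the low result lane
  bool OpSelHi; // which sub-register of this source feeds the high result lane
};

// Two plain F32 multiplies standing in for the two V_MUL_F32_e64 the pass
// emits: one writes dst.sub0, the other dst.sub1.
std::pair<float, float> unpackPkMulF32(const float Src0[2], SrcSel S0,
                                       const float Src1[2], SrcSel S1) {
  float Lo = Src0[S0.OpSel ? 1 : 0] * Src1[S1.OpSel ? 1 : 0];     // dst.sub0
  float Hi = Src0[S0.OpSelHi ? 1 : 0] * Src1[S1.OpSelHi ? 1 : 0]; // dst.sub1
  return {Lo, Hi};
}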
>From 5cb47d262a7d865e2ce9fa006e079db2676b4edb Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 15 Aug 2025 18:24:13 -0500
Subject: [PATCH 08/16] code fix
---
llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 33e07c5a16d97..5dac4a210101e 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -854,7 +854,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
if (SIInstrInfo::isMFMA(MI)){
const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&MI);
NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
- // createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles);
+ createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles);
}
if (ST.useRealTrue16Insts()){
if (MI.getOpcode() != AMDGPU::COPY)
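Patch 08 re-enables the candidate collection, whose core is a cycle-budget walk: starting after an MFMA, per-instruction issue cycles are accumulated, and a packed instruction is only marked for unpacking while the running total, including the extra instructions the unpacked form adds (7 or 9 cycles for F16, per the comments in the pass), still fits under the MFMA's ReleaseAtCycle. The sketch below is a simplified standalone model under those assumptions; it uses plain structs and fixed costs instead of the MC scheduling model, and collectUnpackable is an illustrative name.

#include <cstdint>
#include <vector>

struct CandidateInfo {
  uint16_t IssueCycles;   // scheduling-model cost of the instruction as written
  bool IsPackedCandidate; // V_PK_ADD/MUL_F32 or V_PK_ADD/MUL_F16
  bool IsPackedF16;       // F16 unpacking may also need the mask/shift consts
};

std::vector<size_t>
collectUnpackable(const std::vector<CandidateInfo> &AfterMFMA,
                  uint16_t MFMACycles) {
  std::vector<size_t> ToUnpack;
  uint32_t Total = 0;
  bool MasksMaterialized = false; // corresponds to IsF16MaskSet in the pass
  for (size_t Idx = 0; Idx < AfterMFMA.size(); ++Idx) {
    Total += AfterMFMA[Idx].IssueCycles;
    if (Total > MFMACycles)
      break; // anything later is no longer hidden under the MFMA
    if (!AfterMFMA[Idx].IsPackedCandidate)
      continue;
    // Charge the extra instructions the unpacked form would add.
    Total += AfterMFMA[Idx].IsPackedF16 ? (MasksMaterialized ? 7u : 9u) : 1u;
    if (Total > MFMACycles)
      break;
    if (AfterMFMA[Idx].IsPackedF16)
      MasksMaterialized = true;
    ToUnpack.push_back(Idx);
  }
  return ToUnpack;
}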
>From 178a36354b4b109c4c59572f90205457386c77e6 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Sun, 17 Aug 2025 09:32:00 -0500
Subject: [PATCH 09/16] clang-formatted and mir tests added
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 493 ++++++++++--------
...npack-non-coissue-insts-post-scheduler.mir | 209 ++++++++
2 files changed, 482 insertions(+), 220 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 5dac4a210101e..9a2f898dcb2de 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -28,30 +28,28 @@
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
/// the VGPR_32, the COPY can be completely eliminated.
///
-/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32 and V_PK_ADD_F32)
-/// adjacent to MFMAs such that they can be co-issued.
-/// This helps with overlapping MFMA and certain vector instructions in machine schedules
+/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32 and
+/// V_PK_ADD_F32) adjacent to MFMAs such that they can be co-issued. This helps
+/// with overlapping MFMA and certain vector instructions in machine schedules
/// and is expected to improve performance.
-/// Only those packed instructions are unpacked that are overlapped by the MFMA latency.
-/// Rest should remain untouched.
+/// Only packed instructions that are overlapped by the MFMA latency are
+/// unpacked; the rest remain untouched.
//===----------------------------------------------------------------------===//
-#include "GCNPreRAOptimizations.h"
#include "AMDGPU.h"
+#include "GCNPreRAOptimizations.h"
+#include "GCNSchedStrategy.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/ADT/SetVector.h"
-#include "SIInstrInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/InitializePasses.h"
-#include "GCNSchedStrategy.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
-#include <utility>
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/InitializePasses.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
@@ -66,20 +64,24 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
- bool createListOfPackedInstr(MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack, uint16_t NumMFMACycles);
+ bool createListOfPackedInstr(MachineInstr &BeginMI,
+ SetVector<MachineInstr *> &instrsToUnpack,
+ uint16_t NumMFMACycles);
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
void insertMI(MachineInstr &I);
uint16_t mapToUnpackedOpcode(MachineInstr &I);
SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
unsigned SGPRSrcPos);
SmallVector<MachineInstr *, 2>
- insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
- MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
+ insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO,
+ MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
+ MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
bool isVreg_64);
void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget);
bool IsF16MaskSet;
- Register MaskLo; //mask to extract lower 16 bits for F16 packed instructions
- Register ShiftAmt; //mask to extract higher 16 bits from F16 packed instructions
+ Register MaskLo; // mask to extract lower 16 bits for F16 packed instructions
+ Register ShiftAmt; // shift amount used to extract the higher 16 bits from F16 packed instructions
public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -252,18 +254,18 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
-bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
+ MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_ADD_F16:
- return true;
-
- default:
- return false;
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_ADD_F16:
+ return true;
+ default:
+ return false;
}
}
@@ -273,23 +275,22 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
// VOP3 instructions allow VOP3P source modifiers to be translated to VOP3
// e32 instructions are VOP2 and don't allow source modifiers
switch (Opcode) {
- case AMDGPU::V_PK_ADD_F32:
- return AMDGPU::V_ADD_F32_e64;
- case AMDGPU::V_PK_MUL_F32:
- return AMDGPU::V_MUL_F32_e64;
- case AMDGPU::V_PK_ADD_F16:
- return AMDGPU::V_ADD_F16_e64;
- case AMDGPU::V_PK_MUL_F16:
- return AMDGPU::V_MUL_F16_e64;
- default:
- return std::numeric_limits<uint16_t>::max();
-
+ case AMDGPU::V_PK_ADD_F32:
+ return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::V_PK_MUL_F32:
+ return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::V_PK_ADD_F16:
+ return AMDGPU::V_ADD_F16_e64;
+ case AMDGPU::V_PK_MUL_F16:
+ return AMDGPU::V_MUL_F16_e64;
+ default:
+ return std::numeric_limits<uint16_t>::max();
}
}
SmallVector<MachineInstr *, 2>
GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
- unsigned SGPRSrcPos) {
+ unsigned SGPRSrcPos) {
SmallVector<MachineInstr *, 2> MIList;
MachineBasicBlock &MBB = *I.getParent();
@@ -323,37 +324,46 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
}
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
- MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack, uint16_t NumMFMACycles) {
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack,
+ uint16_t NumMFMACycles) {
auto *BB = BeginMI.getParent();
auto *MF = BB->getParent();
int NumInst = 0;
auto E = BB->end();
-
+
int TotalCyclesBetweenCandidates = 0;
auto SchedModel = TII->getSchedModel();
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
- const MCSchedClassDesc *instrSchedClassDesc = SchedModel.resolveSchedClass(&Instr);
- TotalCyclesBetweenCandidates += SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
+ const MCSchedClassDesc *instrSchedClassDesc =
+ SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates +=
+ SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
if (Instr.isMetaInstruction())
continue;
if (Instr.isTerminator())
return false;
-
+
if (TotalCyclesBetweenCandidates > NumMFMACycles)
return false;
-
+
if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
- if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) || (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)){
- // unpacking packed F16 instructions requires multiple instructions. Instructions are issued to extract lower and higher bits for each operand
- // Instructions are then issued for 2 unpacked instructions, and additional instructions to put them back into the original destination register
- // The following sequence of instructions are issued
-
- // The next two are needed to move masks into vgprs. Ideally, immediates should be used. However, if one of the source operands are sgpr/sregs,
- // then immediates are not allowed. Hence, the need to move these into vgprs
-
+ if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) ||
+ (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)) {
+ // Unpacking packed F16 instructions requires multiple instructions.
+ // Instructions are issued to extract the lower and higher 16 bits of each
+ // operand. Two unpacked instructions are then issued, followed by
+ // additional instructions that write the results back into the original
+ // destination register. The following sequence of instructions is issued:
+
+ // The next two are needed to move the mask and shift amount into vgprs.
+ // Ideally, immediates would be used. However, if one of the source
+ // operands is an sgpr, immediates are not allowed. Hence the need to move
+ // these values into vgprs.
+
// vgpr_32 = V_MOV_B32_e32 65535
// vgpr_32 = V_MOV_B32_e32 16
@@ -365,18 +375,19 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
// vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
// vgpr_32 = V_LSHLREV_B32_e64 vgpr_32, vgpr_32
// dst_reg = V_OR_B32_e64 vgpr_32, vgpr_32
-
- // we need to issue the MOV instructions above only once. Once these are issued, the IsF16MaskSet flag is set
- // subsequent unpacking only needs to issue the remaining instructions
- // The number of latency cycles for each instruction above is 1. It's hard coded into the code to reduce code complexity.
- if (IsF16MaskSet)
+
+ // We need to issue the MOV instructions above only once. Once they are
+ // issued, the IsF16MaskSet flag is set, and subsequent unpacking only
+ // needs to issue the remaining instructions. The latency of each
+ // instruction above is 1 cycle; it is hard coded to reduce code
+ // complexity.
+ if (IsF16MaskSet)
TotalCyclesBetweenCandidates += 7;
else
TotalCyclesBetweenCandidates += 9;
- }
- else
+ } else
TotalCyclesBetweenCandidates += 1;
-
+
if (!(TotalCyclesBetweenCandidates > NumMFMACycles))
instrsToUnpack.insert(&Instr);
}
@@ -385,8 +396,9 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
}
SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
- MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
- MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64) {
+ MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
+ MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1,
+ MachineOperand &HiSrcMO2, bool isVreg_64) {
SmallVector<MachineInstr *, 2> MIList;
MachineBasicBlock &MBB = *I.getParent();
@@ -404,103 +416,117 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
const MCInstrDesc instrDesc = I.getDesc();
- int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int clampIdx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
int64_t clampVal = I.getOperand(clampIdx).getImm();
- int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
- int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ int src0_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int src1_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
- //don't worry about abs values. Packed instructions (VOP3P) do not support them
+ // don't worry about abs values. Packed instructions (VOP3P) do not support
+ // them
unsigned Lo_src0_mods = 0;
unsigned Lo_src1_mods = 0;
uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
- Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst
+ Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst
if (src0_Mods & SISrcMods::OP_SEL_0) {
if (src0_Mods & SISrcMods::NEG) {
Lo_src0_mods |= SISrcMods::NEG;
}
- Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers
- unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
- Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0
- }
- else {
- Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers
- unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
- Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel == 0, select register 0 of reg:sub0_sub1
+ Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); // src0
+ } else {
+ Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0,
+ Src0SubIdx); // src0 //if op_sel == 0, select register 0 of
+ // reg:sub0_sub1
}
-
if (src1_Mods & SISrcMods::OP_SEL_0) {
if (src1_Mods & SISrcMods::NEG) {
Lo_src1_mods |= SISrcMods::NEG;
}
- Op0L_Op1L.addImm(Lo_src1_mods); //src0_modifiers
- unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
- Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src0
- }
- else {
- Op0L_Op1L.addImm(Lo_src1_mods); //src0_modifiers
- unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
- Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- }
- Op0L_Op1L.addImm(clampVal); //clamp
- //packed instructions do not support output modifiers. safe to assign them 0 for this use case
- Op0L_Op1L.addImm(0); //omod
+ Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); // src1
+ } else {
+ Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0,
+ Src1SubIdx); // src1 //if op_sel == 0, select register 0
+ // of reg:sub0_sub1
+ }
+ Op0L_Op1L.addImm(clampVal); // clamp
+ // packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ Op0L_Op1L.addImm(0); // omod
if (isVreg_64) {
Op0L_Op1L->getOperand(0).setIsUndef();
- }
- else if (I.getOperand(0).isUndef()){
+ } else if (I.getOperand(0).isUndef()) {
Op0L_Op1L->getOperand(0).setIsUndef();
}
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
- SrcSubIdx1 =
- TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
- SrcSubIdx2 =
- TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
- DestSubIdx =
- TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
+ SrcSubIdx1 = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ SrcSubIdx2 = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
- //don't worry about abs values. Packed instructions (VOP3P) do not support them
+ // don't worry about abs values. Packed instructions (VOP3P) do not support
+ // them
unsigned Hi_src0_mods = 0;
unsigned Hi_src1_mods = 0;
MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
- Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst
+ Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst
if (src0_Mods & SISrcMods::OP_SEL_1) {
if (src0_Mods & SISrcMods::NEG_HI) {
Hi_src0_mods |= SISrcMods::NEG;
}
- Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers
- unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1);
- Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0
- }
- else {
- Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers
- unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0);
- Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); // src0
+ } else {
+ Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0,
+ Src0SubIdx); // src0 //if op_sel_hi == 0, select register 0
+ // of reg:sub0_sub1
}
if (src1_Mods & SISrcMods::OP_SEL_1) {
if (src1_Mods & SISrcMods::NEG_HI) {
Hi_src1_mods |= SISrcMods::NEG;
}
- Op0H_Op1H.addImm(Hi_src1_mods); //src0_modifiers
- unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1);
- Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src0
- }
- else {
- Op0H_Op1H.addImm(Hi_src1_mods); //src0_modifiers
- unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0);
- Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- }
- Op0H_Op1H.addImm(clampVal); //clamp
- //packed instructions do not support output modifiers. safe to assign them 0 for this use case
- Op0H_Op1H.addImm(0); //omod
+ Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); // src1
+ } else {
+ Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0);
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0,
+ Src1SubIdx); // src1 //if op_sel_hi == 0, select register 0
+ // of reg:sub0_sub1
+ }
+ Op0H_Op1H.addImm(clampVal); // clamp
+ // packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ Op0H_Op1H.addImm(0); // omod
LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
@@ -524,16 +550,15 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg1 = I.getOperand(2).getReg();
Register SrcReg2 = I.getOperand(4).getReg();
-
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
- MachineBasicBlock::iterator MII = I;
const DebugLoc &DL = I.getDebugLoc();
const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg());
const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg());
const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg());
+
const TargetRegisterClass *Src0SubRC =
TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1);
@@ -545,34 +570,38 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
MachineInstr *CopySGPR2 = copyInstrs[1];
bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
- CopySGPR2->getOperand(0), isVReg64);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ SmallVector<MachineInstr *, 2> unpackedInstrs =
+ insertUnpackedMI(I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), isVReg64);
+ unpackedInstrs[0]->addRegisterKilled(
+ unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(
+ unpackedInstrs[1]->getOperand(2).getReg(), TRI);
return;
- }
- else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ } else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) {
SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
MachineInstr *CopySGPR1 = copyInstrs[0];
MachineInstr *CopySGPR2 = copyInstrs[1];
bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, isVReg64);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ SmallVector<MachineInstr *, 2> unpackedInstrs =
+ insertUnpackedMI(I, DstMO, CopySGPR1->getOperand(0), SrcMO2,
+ CopySGPR2->getOperand(0), SrcMO2, isVReg64);
+ unpackedInstrs[0]->addRegisterKilled(
+ unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(
+ unpackedInstrs[1]->getOperand(1).getReg(), TRI);
return;
}
bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, SrcMO2, SrcMO1,
- SrcMO2, isVReg64);
+ SmallVector<MachineInstr *, 2> unpackedInstrs =
+ insertUnpackedMI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, isVReg64);
return;
}
-void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget) {
+void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
+ uint16_t AvailableBudget) {
MachineBasicBlock &MBB = *I.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -585,7 +614,7 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
Register SrcReg1 = SrcMO1.getReg();
const DebugLoc &DL = I.getDebugLoc();
-
+
const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
auto SchedModel = TII->getSchedModel();
@@ -595,24 +624,25 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
auto BuildImm = [&](uint32_t Val) -> std::pair<Register, uint16_t> {
Register ImmReg = MRI.createVirtualRegister(RC);
auto newMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg)
- .addImm(Val);
+ .addImm(Val);
LIS->InsertMachineInstrInMaps(*newMI);
- const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(newMI);
- uint16_t LatencyCycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ const MCSchedClassDesc *SchedClassDesc =
+ SchedModel.resolveSchedClass(newMI);
+ uint16_t LatencyCycles =
+ SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
return {ImmReg, LatencyCycles};
};
-
if (!IsF16MaskSet) {
std::pair<Register, uint16_t> RegAndLatency = BuildImm(0x0000FFFF);
- MaskLo = RegAndLatency.first; //mask for lower 16 bits
+ MaskLo = RegAndLatency.first; // mask for lower 16 bits
AddlCyclesConsumed += RegAndLatency.second;
RegAndLatency = BuildImm(16);
- ShiftAmt = RegAndLatency.first; //mask for higher 16 bits
+ ShiftAmt = RegAndLatency.first; // shift amount for the higher 16 bits
AddlCyclesConsumed += RegAndLatency.second;
IsF16MaskSet = true;
}
-
+
Register Src0_Lo = MRI.createVirtualRegister(RC);
Register Src1_Lo = MRI.createVirtualRegister(RC);
Register Src0_Hi = MRI.createVirtualRegister(RC);
@@ -624,27 +654,33 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
if (SrcMO0.getSubReg())
SubRegID = SrcMO0.getSubReg();
- int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
- int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ int src0_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int src1_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
- int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int clampIdx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
int64_t clampVal = I.getOperand(clampIdx).getImm();
// handle op_sel for src0
if (src0_Mods & SISrcMods::OP_SEL_0) {
- // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo)
- .addReg(ShiftAmt);
+ // if op_sel is set, select higher 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder LoInput0_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo)
+ .addReg(ShiftAmt);
if (SubRegID)
LoInput0_MI.addReg(SrcReg0, 0, SubRegID);
else
LoInput0_MI.addReg(SrcReg0);
LIS->InsertMachineInstrInMaps(*LoInput0_MI);
- }
- else {
- // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo);
+ } else {
+ // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder LoInput0_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo);
if (SubRegID)
LoInput0_MI.addReg(SrcReg0, 0, SubRegID);
else
@@ -655,18 +691,21 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
// handle op_sel_hi for src0
if (src0_Mods & SISrcMods::OP_SEL_1) {
- // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi)
- .addReg(ShiftAmt);
+ // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder HiInput0_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi)
+ .addReg(ShiftAmt);
if (SubRegID)
HiInput0_MI.addReg(SrcReg0, 0, SubRegID);
else
HiInput0_MI.addReg(SrcReg0);
LIS->InsertMachineInstrInMaps(*HiInput0_MI);
- }
- else {
- // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi);
+ } else {
+ // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits
+ // of new vgpr
+ MachineInstrBuilder HiInput0_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi);
if (SubRegID)
HiInput0_MI.addReg(SrcReg0, 0, SubRegID);
else
@@ -680,18 +719,21 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
SubRegID = SrcMO1.getSubReg();
// handle op_sel for src1
if (src1_Mods & SISrcMods::OP_SEL_0) {
- // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo)
- .addReg(ShiftAmt);
+ // if op_sel is set, select higher 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder LoInput1_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo)
+ .addReg(ShiftAmt);
if (SubRegID)
LoInput1_MI.addReg(SrcReg1, 0, SubRegID);
else
LoInput1_MI.addReg(SrcReg1);
LIS->InsertMachineInstrInMaps(*LoInput1_MI);
- }
- else {
- // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo);
+ } else {
+ // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder LoInput1_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo);
if (SubRegID)
LoInput1_MI.addReg(SrcReg1, 0, SubRegID);
else
@@ -702,18 +744,21 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
// handle op_sel_hi for src1
if (src1_Mods & SISrcMods::OP_SEL_1) {
- // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi)
- .addReg(ShiftAmt);
+ // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder HiInput1_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi)
+ .addReg(ShiftAmt);
if (SubRegID)
HiInput1_MI.addReg(SrcReg1, 0, SubRegID);
else
HiInput1_MI.addReg(SrcReg1);
LIS->InsertMachineInstrInMaps(*HiInput1_MI);
- }
- else {
- // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi);
+ } else {
+ // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits
+ // of new vgpr
+ MachineInstrBuilder HiInput1_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi);
if (SubRegID)
HiInput1_MI.addReg(SrcReg1, 0, SubRegID);
else
@@ -728,75 +773,81 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
unsigned Lo_src0_mods = 0;
unsigned Lo_src1_mods = 0;
uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
-
+
// Unpacked instructions
- MachineInstrBuilder LoMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul);
+ MachineInstrBuilder LoMul_MI =
+ BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul);
- if (src0_Mods & SISrcMods::NEG)
+ if (src0_Mods & SISrcMods::NEG)
Lo_src0_mods |= SISrcMods::NEG;
- LoMul_MI.addImm(Lo_src0_mods); //src0_modifiers
- LoMul_MI.addReg(Src0_Lo, RegState::Kill); //src0
+ LoMul_MI.addImm(Lo_src0_mods); // src0_modifiers
+ LoMul_MI.addReg(Src0_Lo, RegState::Kill); // src0
if (src1_Mods & SISrcMods::NEG)
Lo_src1_mods |= SISrcMods::NEG;
- LoMul_MI.addImm(Lo_src1_mods); //src1_modifiers
- LoMul_MI.addReg(Src1_Lo, RegState::Kill); //src1
- LoMul_MI.addImm(clampVal); //clamp
- //packed instructions do not support output modifiers. safe to assign them 0 for this use case
- LoMul_MI.addImm(0); //omod
+ LoMul_MI.addImm(Lo_src1_mods); // src1_modifiers
+ LoMul_MI.addReg(Src1_Lo, RegState::Kill); // src1
+ LoMul_MI.addImm(clampVal); // clamp
+ // packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ LoMul_MI.addImm(0); // omod
- // unpacked instruction with VOP3 encoding for Hi bits
+ // unpacked instruction with VOP3 encoding for Hi bits
unsigned Hi_src0_mods = 0;
unsigned Hi_src1_mods = 0;
- MachineInstrBuilder HiMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul);
- if (src0_Mods & SISrcMods::NEG_HI)
+ MachineInstrBuilder HiMul_MI =
+ BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul);
+ if (src0_Mods & SISrcMods::NEG_HI)
Hi_src0_mods |= SISrcMods::NEG_HI;
-
- HiMul_MI.addImm(Hi_src0_mods); //src0_modifiers
- HiMul_MI.addReg(Src0_Hi, RegState::Kill); //select higher 16 bits if op_sel_hi is set
+
+ HiMul_MI.addImm(Hi_src0_mods); // src0_modifiers
+ HiMul_MI.addReg(Src0_Hi,
+ RegState::Kill); // select higher 16 bits if op_sel_hi is set
if (src1_Mods & SISrcMods::NEG_HI)
Hi_src1_mods |= SISrcMods::NEG_HI;
-
- HiMul_MI.addImm(Hi_src1_mods); //src0_modifiers
- HiMul_MI.addReg(Src1_Hi, RegState::Kill); //select higher 16 bits from src1 if op_sel_hi is set
- HiMul_MI.addImm(clampVal); //clamp
- //packed instructions do not support output modifiers. safe to assign them 0 for this use case
- HiMul_MI.addImm(0); //omod
+
+ HiMul_MI.addImm(Hi_src1_mods); // src1_modifiers
+ HiMul_MI.addReg(
+ Src1_Hi,
+ RegState::Kill); // select higher 16 bits from src1 if op_sel_hi is set
+ HiMul_MI.addImm(clampVal); // clamp
+ // packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ HiMul_MI.addImm(0); // omod
// Shift HiMul left by 16
Register HiMulShifted = MRI.createVirtualRegister(RC);
- MachineInstrBuilder HiMulShifted_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted)
- .addReg(ShiftAmt)
- .addReg(HiMul);
+ MachineInstrBuilder HiMulShifted_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted)
+ .addReg(ShiftAmt)
+ .addReg(HiMul);
SubRegID = 0;
if (DstMO.getSubReg())
SubRegID = DstMO.getSubReg();
// OR LoMul | (HiMul << 16)
- MachineInstrBuilder RewriteBackToDst_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64));
+ MachineInstrBuilder RewriteBackToDst_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64));
if (SubRegID) {
- if (DstMO.isUndef()){
+ if (DstMO.isUndef()) {
RewriteBackToDst_MI.addDef(DstReg, RegState::Undef, SubRegID);
- }
- else {
+ } else {
RewriteBackToDst_MI.addDef(DstReg, 0, SubRegID);
}
- }
- else {
- if (DstMO.isUndef()){
+ } else {
+ if (DstMO.isUndef()) {
RewriteBackToDst_MI.addDef(DstReg, RegState::Undef);
- }
- else {
+ } else {
RewriteBackToDst_MI.addDef(DstReg);
}
}
RewriteBackToDst_MI.addReg(LoMul);
RewriteBackToDst_MI.addReg(HiMulShifted);
-
+
LIS->InsertMachineInstrInMaps(*LoMul_MI);
LIS->InsertMachineInstrInMaps(*HiMul_MI);
LIS->InsertMachineInstrInMaps(*HiMulShifted_MI);
@@ -805,7 +856,6 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
I.eraseFromParent();
LIS->removeInterval(DstReg);
LIS->createAndComputeVirtRegInterval(DstReg);
-
}
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
@@ -844,19 +894,22 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
}
// Add RA hints to improve True16 COPY elimination.
- // Unpack packed instructions to overlap MFMAs. This allows the compiler to co-issue unpacked instructions with MFMA
+ // Unpack packed instructions to overlap MFMAs. This allows the compiler to
+ // co-issue unpacked instructions with MFMA
for (MachineBasicBlock &MBB : MF) {
SetVector<MachineInstr *> instrsToUnpack;
IsF16MaskSet = false;
uint16_t NumMFMACycles = 0;
auto SchedModel = TII->getSchedModel();
for (MachineInstr &MI : MBB) {
- if (SIInstrInfo::isMFMA(MI)){
- const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&MI);
- NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ if (SIInstrInfo::isMFMA(MI)) {
+ const MCSchedClassDesc *SchedClassDesc =
+ SchedModel.resolveSchedClass(&MI);
+ NumMFMACycles =
+ SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles);
}
- if (ST.useRealTrue16Insts()){
+ if (ST.useRealTrue16Insts()) {
if (MI.getOpcode() != AMDGPU::COPY)
continue;
Register Dst = MI.getOperand(0).getReg();
@@ -883,13 +936,13 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
}
}
-
+
if (!instrsToUnpack.empty()) {
for (MachineInstr *MI : instrsToUnpack) {
- if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) || (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) {
+ if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) ||
+ (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) {
processF16Unpacking(*MI, NumMFMACycles);
- }
- else {
+ } else {
insertMI(*MI);
}
}
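One detail worth calling out from the modifier handling above: the packed neg bits split across the two unpacked operations, with NEG applying to the operand feeding the low result lane and NEG_HI to the operand feeding the high lane. The tiny lane-level sketch below assumes standard VOP3P semantics; applyNegModifiers is an illustrative name and not part of the patch.

#include <utility>

// Lane view of the packed neg modifiers: NEG negates a source in the low
// result lane, NEG_HI in the high result lane.
std::pair<float, float> applyNegModifiers(float LoLaneOperand,
                                          float HiLaneOperand, bool Neg,
                                          bool NegHi) {
  return {Neg ? -LoLaneOperand : LoLaneOperand,
          NegHi ? -HiLaneOperand : HiLaneOperand};
}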
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
new file mode 100644
index 0000000000000..b13f61a963ed5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
@@ -0,0 +1,209 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -march=amdgcn -mcpu=gfx950 -run-pass=amdgpu-pre-ra-optimizations -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: test_pk_mul_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+
+ ; GCN-LABEL: name: test_pk_mul_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: KILL %1.sub6_sub7
+ ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+ ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+ ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+ ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+ ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+ ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub5, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+ %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+ %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+ early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+ KILL %8.sub6_sub7
+ early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+ %57:vreg_128_align2 = COPY %22
+ %58:vreg_128_align2 = COPY %23
+ undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+ %69.sub1:vreg_64_align2 = COPY %39.sub7
+ undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+ undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %75.sub1:vreg_64_align2 = COPY %39.sub5
+ %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_op_sel_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+
+ ; GCN-LABEL: name: test_op_sel_selection_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: KILL %1.sub6_sub7
+ ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+ ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+ ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+ ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+ ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+ ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub5, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+ %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+ %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+ early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+ KILL %8.sub6_sub7
+ early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+ %57:vreg_128_align2 = COPY %22
+ %58:vreg_128_align2 = COPY %23
+ undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+ %69.sub1:vreg_64_align2 = COPY %39.sub7
+ undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+ undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %75.sub1:vreg_64_align2 = COPY %39.sub5
+ %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub4_sub5:sgpr_512, 12, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_op_sel_hi_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_op_sel_hi_selection_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: KILL %1.sub6_sub7
+ ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+ ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+ ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+ ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+ ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+ ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+ %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+ %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+ early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+ KILL %8.sub6_sub7
+ early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+ %57:vreg_128_align2 = COPY %22
+ %58:vreg_128_align2 = COPY %23
+ undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+ %69.sub1:vreg_64_align2 = COPY %39.sub7
+ undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+ undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %75.sub1:vreg_64_align2 = COPY %39.sub5
+ %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 0, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_only_overlapped_unpacking_f16
+tracksRegLiveness: true
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_only_overlapped_unpacking_f16
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: early-clobber %4:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: dead [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GCN-NEXT: early-clobber %6:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_LOAD_DWORDX2_IMM1]]
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %4.sub7
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %4.sub6
+ ; GCN-NEXT: undef [[V_PK_MUL_F16_:%[0-9]+]].sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %6.sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead early-clobber %12:areg_256_align2 = V_MFMA_F64_16X16X4F64_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; GCN-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 %6.sub6, [[V_MOV_B32_e32_]], implicit $exec
+ ; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], %6.sub6, implicit $exec
+ ; GCN-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 [[COPY4]], [[V_MOV_B32_e32_]], implicit $exec
+ ; GCN-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec
+ ; GCN-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, killed [[V_AND_B32_e32_]], 0, killed [[V_AND_B32_e32_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, killed [[V_LSHRREV_B32_e64_]], 0, killed [[V_LSHRREV_B32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_MUL_F16_e64_1]], implicit $exec
+ ; GCN-NEXT: [[V_PK_MUL_F16_:%[0-9]+]].sub2:vreg_128_align2 = V_OR_B32_e64 [[V_MUL_F16_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %4.sub5
+ ; GCN-NEXT: dead [[V_PK_MUL_F16_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %6.sub5, 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+ %22:sreg_64_xexec = S_LOAD_DWORDX2_IMM %8.sub4_sub5, 0, 0
+ %23:sreg_64_xexec = S_LOAD_DWORDX2_IMM %8.sub6_sub7, 0, 0
+ early-clobber %25:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %8.sub6_sub7, 0, 0
+ %12:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ early-clobber %24:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %8.sub4_sub5, 0, 0
+ %29:vreg_64_align2 = COPY %22
+ %30:vreg_64_align2 = COPY %23
+ %51:vgpr_32 = COPY %25.sub7
+ %55:vgpr_32 = COPY %25.sub6
+ undef %99.sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub7, 8, %51, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %28:areg_256_align2 = V_MFMA_F64_16X16X4F64_e64 %29, %30, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %99.sub2:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub6, 8, %55, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %59:vgpr_32 = COPY %25.sub5
+ %99.sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub5, 8, %59, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
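
The CHECK lines in test_only_overlapped_unpacking_f16 above encode the F16 expansion as a mask/shift/multiply/recombine sequence. As a rough, standalone illustration (not code from the patch), source modifiers aside, the lane plumbing it performs is equivalent to the sketch below; unpackedPkF16 and HalfOp are hypothetical names, with HalfOp standing in for the scalar V_MUL_F16.

#include <cstdint>
#include <functional>

// Illustrative sketch only: models the V_AND_B32/V_LSHRREV_B32 split, the two
// scalar operations, and the V_LSHLREV_B32/V_OR_B32 recombine checked above.
uint32_t unpackedPkF16(
    uint32_t Src0, uint32_t Src1,
    const std::function<uint16_t(uint16_t, uint16_t)> &HalfOp) {
  uint16_t Lo0 = Src0 & 0xFFFFu;  // V_AND_B32 with the 0x0000FFFF mask
  uint16_t Hi0 = Src0 >> 16;      // V_LSHRREV_B32 by 16
  uint16_t Lo1 = Src1 & 0xFFFFu;
  uint16_t Hi1 = Src1 >> 16;
  uint32_t Lo = HalfOp(Lo0, Lo1); // unpacked V_MUL_F16 on the low halves
  uint32_t Hi = HalfOp(Hi0, Hi1); // unpacked V_MUL_F16 on the high halves
  return Lo | (Hi << 16);         // V_LSHLREV_B32 by 16, then V_OR_B32
}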
>From 7acafc48136cd22d64d3c03b643075e070c97754 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Mon, 18 Aug 2025 15:21:58 -0500
Subject: [PATCH 10/16] adding gfx942 support and code cleanup
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 199 ++++++------------
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 +-
2 files changed, 68 insertions(+), 137 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 9a2f898dcb2de..6ec71324df84e 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -65,15 +65,14 @@ class GCNPreRAOptimizationsImpl {
bool processReg(Register Reg);
bool createListOfPackedInstr(MachineInstr &BeginMI,
- SetVector<MachineInstr *> &instrsToUnpack,
+ SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles);
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
- void insertMI(MachineInstr &I);
+ void processF32Unpacking(MachineInstr &I);
uint16_t mapToUnpackedOpcode(MachineInstr &I);
- SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
- unsigned SGPRSrcPos);
+
SmallVector<MachineInstr *, 2>
- insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO,
+ insertUnpackedF32MI(MachineInstr &I, MachineOperand &DstMO,
MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
bool isVreg_64);
@@ -288,43 +287,8 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
}
}
-SmallVector<MachineInstr *, 2>
-GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
- unsigned SGPRSrcPos) {
- SmallVector<MachineInstr *, 2> MIList;
-
- MachineBasicBlock &MBB = *I.getParent();
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- MachineFunction &MF = *MBB.getParent();
- const DebugLoc &DL = I.getDebugLoc();
-
- Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
- MachineInstr *CopySGPR1 =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY))
- .addDef(TmpReg, RegState::Undef)
- .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub0);
- unsigned SubIdx = TRI->composeSubRegIndices(
- AMDGPU::sub0, CopySGPR1->getOperand(0).getSubReg());
- CopySGPR1->getOperand(0).setReg(CopySGPR1->getOperand(0).getReg());
- CopySGPR1->getOperand(0).setSubReg(SubIdx);
- LIS->InsertMachineInstrInMaps(*CopySGPR1);
- MIList.push_back(CopySGPR1);
-
- MachineInstr *CopySGPR2 =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY))
- .addDef(TmpReg)
- .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub1);
- SubIdx = TRI->composeSubRegIndices(AMDGPU::sub1,
- CopySGPR2->getOperand(0).getSubReg());
- CopySGPR2->getOperand(0).setReg(CopySGPR2->getOperand(0).getReg());
- CopySGPR2->getOperand(0).setSubReg(SubIdx);
- LIS->InsertMachineInstrInMaps(*CopySGPR2);
- MIList.push_back(CopySGPR2);
- return MIList;
-}
-
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
- MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack,
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles) {
auto *BB = BeginMI.getParent();
auto *MF = BB->getParent();
@@ -336,10 +300,10 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
auto SchedModel = TII->getSchedModel();
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
- const MCSchedClassDesc *instrSchedClassDesc =
+ const MCSchedClassDesc *InstrSchedClassDesc =
SchedModel.resolveSchedClass(&Instr);
TotalCyclesBetweenCandidates +=
- SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
if (Instr.isMetaInstruction())
continue;
@@ -389,16 +353,16 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
TotalCyclesBetweenCandidates += 1;
if (!(TotalCyclesBetweenCandidates > NumMFMACycles))
- instrsToUnpack.insert(&Instr);
+ InstrsToUnpack.insert(&Instr);
}
}
return true;
}
-SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
+SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1,
- MachineOperand &HiSrcMO2, bool isVreg_64) {
+ MachineOperand &HiSrcMO2, bool IsVreg_64) {
SmallVector<MachineInstr *, 2> MIList;
MachineBasicBlock &MBB = *I.getParent();
@@ -414,28 +378,27 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
unsigned DestSubIdx =
TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
- const MCInstrDesc instrDesc = I.getDesc();
+ const MCInstrDesc InstrDesc = I.getDesc();
- int clampIdx =
+ int ClampIdx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
- int64_t clampVal = I.getOperand(clampIdx).getImm();
+ int64_t ClampVal = I.getOperand(ClampIdx).getImm();
- int src0_modifiers_Idx =
+ int Src0_modifiers_Idx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
- int src1_modifiers_Idx =
+ int Src1_modifiers_Idx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
- unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
- unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
+ unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
+ unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
- // don't worry about abs values. Packed instructions (VOP3P) do not support
- // them
+ // Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
unsigned Lo_src0_mods = 0;
unsigned Lo_src1_mods = 0;
- uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
- MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
+ uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst
- if (src0_Mods & SISrcMods::OP_SEL_0) {
- if (src0_Mods & SISrcMods::NEG) {
+ if (Src0_Mods & SISrcMods::OP_SEL_0) {
+ if (Src0_Mods & SISrcMods::NEG) {
Lo_src0_mods |= SISrcMods::NEG;
}
Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
@@ -450,8 +413,8 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
Src0SubIdx); // src0 //if op_sel == 0, select register 0 of
// reg:sub0_sub1
}
- if (src1_Mods & SISrcMods::OP_SEL_0) {
- if (src1_Mods & SISrcMods::NEG) {
+ if (Src1_Mods & SISrcMods::OP_SEL_0) {
+ if (Src1_Mods & SISrcMods::NEG) {
Lo_src1_mods |= SISrcMods::NEG;
}
Op0L_Op1L.addImm(Lo_src1_mods); // src0_modifiers
@@ -466,12 +429,12 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0
// of reg:sub0_sub1
}
- Op0L_Op1L.addImm(clampVal); // clamp
+ Op0L_Op1L.addImm(ClampVal); // clamp
// packed instructions do not support output modifiers. safe to assign them 0
// for this use case
Op0L_Op1L.addImm(0); // omod
- if (isVreg_64) {
+ if (IsVreg_64) {
Op0L_Op1L->getOperand(0).setIsUndef();
} else if (I.getOperand(0).isUndef()) {
Op0L_Op1L->getOperand(0).setIsUndef();
@@ -483,15 +446,14 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
SrcSubIdx2 = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
- // don't worry about abs values. Packed instructions (VOP3P) do not support
- // them
+ // Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
unsigned Hi_src0_mods = 0;
unsigned Hi_src1_mods = 0;
- MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
+ MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst
- if (src0_Mods & SISrcMods::OP_SEL_1) {
- if (src0_Mods & SISrcMods::NEG_HI) {
+ if (Src0_Mods & SISrcMods::OP_SEL_1) {
+ if (Src0_Mods & SISrcMods::NEG_HI) {
Hi_src0_mods |= SISrcMods::NEG;
}
Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
@@ -507,8 +469,8 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
// of reg:sub0_sub1
}
- if (src1_Mods & SISrcMods::OP_SEL_1) {
- if (src1_Mods & SISrcMods::NEG_HI) {
+ if (Src1_Mods & SISrcMods::OP_SEL_1) {
+ if (Src1_Mods & SISrcMods::NEG_HI) {
Hi_src1_mods |= SISrcMods::NEG;
}
Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers
@@ -523,7 +485,7 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0
// of reg:sub0_sub1
}
- Op0H_Op1H.addImm(clampVal); // clamp
+ Op0H_Op1H.addImm(ClampVal); // clamp
// packed instructions do not support output modifiers. safe to assign them 0
// for this use case
Op0H_Op1H.addImm(0); // omod
@@ -542,7 +504,7 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
return MIList;
}
-void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
+void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
MachineBasicBlock &MBB = *I.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineFunction &MF = *MBB.getParent();
@@ -563,40 +525,9 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1);
- if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) {
- // try with sgpr32
- SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 4);
- MachineInstr *CopySGPR1 = copyInstrs[0];
- MachineInstr *CopySGPR2 = copyInstrs[1];
-
- bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- SmallVector<MachineInstr *, 2> unpackedInstrs =
- insertUnpackedMI(I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
- CopySGPR2->getOperand(0), isVReg64);
- unpackedInstrs[0]->addRegisterKilled(
- unpackedInstrs[0]->getOperand(2).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(
- unpackedInstrs[1]->getOperand(2).getReg(), TRI);
- return;
- } else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) {
- SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
- MachineInstr *CopySGPR1 = copyInstrs[0];
- MachineInstr *CopySGPR2 = copyInstrs[1];
-
- bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- SmallVector<MachineInstr *, 2> unpackedInstrs =
- insertUnpackedMI(I, DstMO, CopySGPR1->getOperand(0), SrcMO2,
- CopySGPR2->getOperand(0), SrcMO2, isVReg64);
- unpackedInstrs[0]->addRegisterKilled(
- unpackedInstrs[0]->getOperand(1).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(
- unpackedInstrs[1]->getOperand(1).getReg(), TRI);
- return;
- }
-
- bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- SmallVector<MachineInstr *, 2> unpackedInstrs =
- insertUnpackedMI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, isVReg64);
+ bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
+ SmallVector<MachineInstr *, 2> UnpackedInstrs =
+ insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64);
return;
}
@@ -623,11 +554,11 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
auto BuildImm = [&](uint32_t Val) -> std::pair<Register, uint16_t> {
Register ImmReg = MRI.createVirtualRegister(RC);
- auto newMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ auto NewMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg)
.addImm(Val);
- LIS->InsertMachineInstrInMaps(*newMI);
+ LIS->InsertMachineInstrInMaps(*NewMI);
const MCSchedClassDesc *SchedClassDesc =
- SchedModel.resolveSchedClass(newMI);
+ SchedModel.resolveSchedClass(NewMI);
uint16_t LatencyCycles =
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
return {ImmReg, LatencyCycles};
@@ -636,10 +567,8 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
if (!IsF16MaskSet) {
std::pair<Register, uint16_t> RegAndLatency = BuildImm(0x0000FFFF);
MaskLo = RegAndLatency.first; // mask for lower 16 bits
- AddlCyclesConsumed += RegAndLatency.second;
RegAndLatency = BuildImm(16);
ShiftAmt = RegAndLatency.first; // mask for higher 16 bits
- AddlCyclesConsumed += RegAndLatency.second;
IsF16MaskSet = true;
}
@@ -654,18 +583,18 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
if (SrcMO0.getSubReg())
SubRegID = SrcMO0.getSubReg();
- int src0_modifiers_Idx =
+ int Src0_modifiers_Idx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
- int src1_modifiers_Idx =
+ int Src1_modifiers_Idx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
- unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
- unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
- int clampIdx =
+ unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
+ unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
+ int ClampIdx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
- int64_t clampVal = I.getOperand(clampIdx).getImm();
+ int64_t ClampVal = I.getOperand(ClampIdx).getImm();
// handle op_sel for src0
- if (src0_Mods & SISrcMods::OP_SEL_0) {
+ if (Src0_Mods & SISrcMods::OP_SEL_0) {
// if op_sel is set, select higher 16 bits and copy into lower 16 bits of
// new vgpr
MachineInstrBuilder LoInput0_MI =
@@ -690,7 +619,7 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
}
// handle op_sel_hi for src0
- if (src0_Mods & SISrcMods::OP_SEL_1) {
+ if (Src0_Mods & SISrcMods::OP_SEL_1) {
// if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of
// new vgpr
MachineInstrBuilder HiInput0_MI =
@@ -718,7 +647,7 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
if (SrcMO0.getSubReg())
SubRegID = SrcMO1.getSubReg();
// handle op_sel for src1
- if (src1_Mods & SISrcMods::OP_SEL_0) {
+ if (Src1_Mods & SISrcMods::OP_SEL_0) {
// if op_sel is set, select higher 16 bits and copy into lower 16 bits of
// new vgpr
MachineInstrBuilder LoInput1_MI =
@@ -743,7 +672,7 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
}
// handle op_sel_hi for src1
- if (src1_Mods & SISrcMods::OP_SEL_1) {
+ if (Src1_Mods & SISrcMods::OP_SEL_1) {
// if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of
// new vgpr
MachineInstrBuilder HiInput1_MI =
@@ -772,24 +701,24 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
unsigned Lo_src0_mods = 0;
unsigned Lo_src1_mods = 0;
- uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
+ uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
// Unpacked instructions
MachineInstrBuilder LoMul_MI =
- BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul);
+ BuildMI(MBB, I, DL, TII->get(UnpackedOpcode), LoMul);
- if (src0_Mods & SISrcMods::NEG)
+ if (Src0_Mods & SISrcMods::NEG)
Lo_src0_mods |= SISrcMods::NEG;
LoMul_MI.addImm(Lo_src0_mods); // src0_modifiers
LoMul_MI.addReg(Src0_Lo, RegState::Kill); // src0
- if (src1_Mods & SISrcMods::NEG)
+ if (Src1_Mods & SISrcMods::NEG)
Lo_src1_mods |= SISrcMods::NEG;
LoMul_MI.addImm(Lo_src1_mods); // src1_modifiers
LoMul_MI.addReg(Src1_Lo, RegState::Kill); // src1
- LoMul_MI.addImm(clampVal); // clamp
+ LoMul_MI.addImm(ClampVal); // clamp
// packed instructions do not support output modifiers. safe to assign them 0
// for this use case
LoMul_MI.addImm(0); // omod
@@ -799,22 +728,22 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
unsigned Hi_src1_mods = 0;
MachineInstrBuilder HiMul_MI =
- BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul);
- if (src0_Mods & SISrcMods::NEG_HI)
+ BuildMI(MBB, I, DL, TII->get(UnpackedOpcode), HiMul);
+ if (Src0_Mods & SISrcMods::NEG_HI)
Hi_src0_mods |= SISrcMods::NEG_HI;
HiMul_MI.addImm(Hi_src0_mods); // src0_modifiers
HiMul_MI.addReg(Src0_Hi,
RegState::Kill); // select higher 16 bits if op_sel_hi is set
- if (src1_Mods & SISrcMods::NEG_HI)
+ if (Src1_Mods & SISrcMods::NEG_HI)
Hi_src1_mods |= SISrcMods::NEG_HI;
HiMul_MI.addImm(Hi_src1_mods); // src0_modifiers
HiMul_MI.addReg(
Src1_Hi,
RegState::Kill); // select higher 16 bits from src1 if op_sel_hi is set
- HiMul_MI.addImm(clampVal); // clamp
+ HiMul_MI.addImm(ClampVal); // clamp
// packed instructions do not support output modifiers. safe to assign them 0
// for this use case
HiMul_MI.addImm(0); // omod
@@ -897,7 +826,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
// Unpack packed instructions to overlap MFMAs. This allows the compiler to
// co-issue unpacked instructions with MFMA
for (MachineBasicBlock &MBB : MF) {
- SetVector<MachineInstr *> instrsToUnpack;
+ SetVector<MachineInstr *> InstrsToUnpack;
IsF16MaskSet = false;
uint16_t NumMFMACycles = 0;
auto SchedModel = TII->getSchedModel();
@@ -907,7 +836,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
SchedModel.resolveSchedClass(&MI);
NumMFMACycles =
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
- createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles);
+ createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles);
}
if (ST.useRealTrue16Insts()) {
if (MI.getOpcode() != AMDGPU::COPY)
@@ -937,13 +866,13 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
}
}
- if (!instrsToUnpack.empty()) {
- for (MachineInstr *MI : instrsToUnpack) {
+ if (!InstrsToUnpack.empty()) {
+ for (MachineInstr *MI : InstrsToUnpack) {
if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) ||
(MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) {
processF16Unpacking(*MI, NumMFMACycles);
} else {
- insertMI(*MI);
+ processF32Unpacking(*MI);
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5562ff590b71d..1f7cd0140b32c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6174,9 +6174,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
bool IsGFX950Only = ST.hasGFX950Insts();
- if (!IsGFX950Only)
+ bool IsGFX940Only = ST.hasGFX940Insts();
+
+ if (!IsGFX950Only && !IsGFX940Only)
return false;
-
+
if (!isVALU(MI))
return false;
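
For orientation, the gating decision in createListOfPackedInstr above boils down to a running latency budget: the ReleaseAtCycle cost of the instructions sitting between the MFMA and the packed candidate is accumulated and compared against the MFMA's own latency. A minimal sketch of that comparison follows; the helper name and the vector-of-cycles input are illustrative, not from the pass.

#include <cstdint>
#include <vector>

// Sketch only: accumulate the per-instruction cycle cost between the MFMA and
// the candidate, charge one extra cycle for the additional instruction created
// by unpacking, and unpack only while the total still fits inside the MFMA's
// latency window.
bool fitsUnderMFMALatency(const std::vector<uint16_t> &CyclesInBetween,
                          uint16_t NumMFMACycles) {
  uint32_t TotalCycles = 0;
  for (uint16_t Cycles : CyclesInBetween)
    TotalCycles += Cycles;
  TotalCycles += 1; // the unpacked pair issues one extra instruction
  return TotalCycles <= NumMFMACycles;
}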
>From cf4cb9e17a7ecc657528dfb11d4fef2d1bd60b87 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 21 Aug 2025 12:18:26 -0500
Subject: [PATCH 11/16] adding pk_fma_f32 support and more code cleanup
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 305 ++++++++++++++----
1 file changed, 239 insertions(+), 66 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 6ec71324df84e..8721fc7ec3afc 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -28,16 +28,16 @@
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
/// the VGPR_32, the COPY can be completely eliminated.
///
-/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32 and
-/// V_PK_ADD_F32) adjacent to MFMAs such that they can be co-issued. This helps
-/// with overlapping MFMA and certain vector instructions in machine schedules
-/// and is expected to improve performance.
-/// Only those packed instructions are unpacked that are overlapped by the MFMA
-/// latency. Rest should remain untouched.
+/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32/F16,
+/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs such that they can be
+/// co-issued. This helps with overlapping MFMA and certain vector instructions
+/// in machine schedules and is expected to improve performance. Only those
+/// packed instructions are unpacked that are overlapped by the MFMA latency.
+/// Rest should remain untouched.
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
#include "GCNPreRAOptimizations.h"
+#include "AMDGPU.h"
#include "GCNSchedStrategy.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -70,13 +70,14 @@ class GCNPreRAOptimizationsImpl {
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
void processF32Unpacking(MachineInstr &I);
uint16_t mapToUnpackedOpcode(MachineInstr &I);
-
- SmallVector<MachineInstr *, 2>
- insertUnpackedF32MI(MachineInstr &I, MachineOperand &DstMO,
- MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
- MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
- bool isVreg_64);
+
+ void insertUnpackedF32MI(MachineInstr &I, MachineOperand &DstMO,
+ MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
+ MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
+ bool isVreg_64);
void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget);
+ void processFMAF32Unpacking(MachineInstr &I);
+
bool IsF16MaskSet;
Register MaskLo; // mask to extract lower 16 bits for F16 packed instructions
Register
@@ -261,6 +262,7 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
case AMDGPU::V_PK_MUL_F32:
case AMDGPU::V_PK_MUL_F16:
case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_FMA_F32:
return true;
default:
@@ -282,6 +284,8 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
return AMDGPU::V_ADD_F16_e64;
case AMDGPU::V_PK_MUL_F16:
return AMDGPU::V_MUL_F16_e64;
+ case AMDGPU::V_PK_FMA_F32:
+ return AMDGPU::V_FMA_F32_e64;
default:
return std::numeric_limits<uint16_t>::max();
}
@@ -359,12 +363,11 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
return true;
}
-SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
+void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1,
MachineOperand &HiSrcMO2, bool IsVreg_64) {
- SmallVector<MachineInstr *, 2> MIList;
MachineBasicBlock &MBB = *I.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineFunction &MF = *MBB.getParent();
@@ -395,53 +398,49 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
unsigned Lo_src0_mods = 0;
unsigned Lo_src1_mods = 0;
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
+ return;
+
MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst
+ if (Src0_Mods & SISrcMods::NEG) {
+ Lo_src0_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
if (Src0_Mods & SISrcMods::OP_SEL_0) {
- if (Src0_Mods & SISrcMods::NEG) {
- Lo_src0_mods |= SISrcMods::NEG;
- }
- Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
unsigned Src0SubIdx =
TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); // src0
} else {
- Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
unsigned Src0SubIdx =
TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0,
Src0SubIdx); // src0 //if op_sel == 0, select register 0 of
// reg:sub0_sub1
}
+ if (Src1_Mods & SISrcMods::NEG) {
+ Lo_src1_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers
if (Src1_Mods & SISrcMods::OP_SEL_0) {
- if (Src1_Mods & SISrcMods::NEG) {
- Lo_src1_mods |= SISrcMods::NEG;
- }
- Op0L_Op1L.addImm(Lo_src1_mods); // src0_modifiers
unsigned Src1SubIdx =
TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); // src0
} else {
- Op0L_Op1L.addImm(Lo_src1_mods); // src0_modifiers
unsigned Src1SubIdx =
TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
- Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0,
- Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0
- // of reg:sub0_sub1
+ // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx);
}
Op0L_Op1L.addImm(ClampVal); // clamp
// packed instructions do not support output modifiers. safe to assign them 0
// for this use case
Op0L_Op1L.addImm(0); // omod
- if (IsVreg_64) {
- Op0L_Op1L->getOperand(0).setIsUndef();
- } else if (I.getOperand(0).isUndef()) {
+ if (I.getOperand(0).isUndef()) {
Op0L_Op1L->getOperand(0).setIsUndef();
}
-
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
-
SrcSubIdx1 = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
SrcSubIdx2 = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
@@ -449,42 +448,225 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
// Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
unsigned Hi_src0_mods = 0;
unsigned Hi_src1_mods = 0;
-
MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst
+ if (Src0_Mods & SISrcMods::NEG_HI) {
+ Hi_src0_mods |= SISrcMods::NEG_HI;
+ }
+ Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
if (Src0_Mods & SISrcMods::OP_SEL_1) {
- if (Src0_Mods & SISrcMods::NEG_HI) {
- Hi_src0_mods |= SISrcMods::NEG;
- }
- Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
unsigned Src0SubIdx =
TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1);
Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); // src0
} else {
- Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
unsigned Src0SubIdx =
TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0);
- Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0,
- Src0SubIdx); // src0 //if op_sel_hi == 0, select register 0
- // of reg:sub0_sub1
+ // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx);
}
-
+ if (Src1_Mods & SISrcMods::NEG_HI) {
+ Hi_src1_mods |= SISrcMods::NEG_HI;
+ }
+ Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers
if (Src1_Mods & SISrcMods::OP_SEL_1) {
- if (Src1_Mods & SISrcMods::NEG_HI) {
- Hi_src1_mods |= SISrcMods::NEG;
- }
- Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers
unsigned Src1SubIdx =
TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1);
Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); // src0
} else {
- Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers
unsigned Src1SubIdx =
TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0);
- Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0,
+ // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx);
+ }
+ Op0H_Op1H.addImm(ClampVal); // clamp
+ // packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ Op0H_Op1H.addImm(0); // omod
+ LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
+
+ if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ }
+ LIS->RemoveMachineInstrFromMaps(I);
+ I.eraseFromParent();
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+ return;
+}
+
+void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) {
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg1 = I.getOperand(2).getReg();
+ Register SrcReg2 = I.getOperand(4).getReg();
+ Register SrcReg3 = I.getOperand(6).getReg();
+ MachineOperand &DstMO = I.getOperand(0);
+ MachineOperand &SrcMO1 = I.getOperand(2);
+ MachineOperand &SrcMO2 = I.getOperand(4);
+ MachineOperand &SrcMO3 = I.getOperand(6);
+
+ const DebugLoc &DL = I.getDebugLoc();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg());
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg());
+ const TargetRegisterClass *Src2RC = MRI.getRegClass(I.getOperand(6).getReg());
+
+ bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
+
+ // insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64);
+ unsigned SrcSubIdx1 =
+ TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0);
+ unsigned SrcSubIdx2 =
+ TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0);
+ unsigned SrcSubIdx3 =
+ TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0);
+ unsigned DestSubIdx =
+ TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
+
+ const MCInstrDesc InstrDesc = I.getDesc();
+ int ClampIdx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int64_t ClampVal = I.getOperand(ClampIdx).getImm();
+ int Src0_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int Src1_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ int Src2_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src2_modifiers);
+ unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
+ unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
+ unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
+
+ // Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
+ unsigned Lo_src0_mods = 0;
+ unsigned Lo_src1_mods = 0;
+ unsigned Lo_src2_mods = 0;
+ uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
+ return;
+
+ MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+ Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst
+ if (Src0_Mods & SISrcMods::NEG) {
+ Lo_src0_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
+ if (Src0_Mods & SISrcMods::OP_SEL_0) {
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0
+ } else {
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0);
+ // if op_sel == 0, select register 0 of reg:sub0_sub1
+ Op0L_Op1L.addReg(SrcMO1.getReg(), 0, Src0SubIdx);
+ }
+
+ if (Src1_Mods & SISrcMods::NEG) {
+ Lo_src1_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers
+ if (Src1_Mods & SISrcMods::OP_SEL_0) {
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0
+ } else {
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0);
+ Op0L_Op1L.addReg(SrcMO2.getReg(), 0,
Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0
// of reg:sub0_sub1
}
+
+ if (Src2_Mods & SISrcMods::NEG) {
+ Lo_src2_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src2_mods); // src2_modifiers
+ if (Src2_Mods & SISrcMods::OP_SEL_0) {
+ unsigned Src2SubIdx =
+ TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(SrcMO3.getReg(), 0, Src2SubIdx);
+ } else {
+ unsigned Src2SubIdx =
+ TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0);
+ // if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ Op0L_Op1L.addReg(SrcMO3.getReg(), 0, Src2SubIdx);
+ }
+ Op0L_Op1L.addImm(ClampVal); // clamp
+ // packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ Op0L_Op1L.addImm(0); // omod
+
+ if (I.getOperand(0).isUndef()) {
+ Op0L_Op1L->getOperand(0).setIsUndef();
+ }
+
+ LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
+
+ SrcSubIdx1 = TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1);
+ SrcSubIdx2 = TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1);
+ SrcSubIdx3 = TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1);
+ DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
+
+ // Packed instructions (VOP3P) do not support abs. It is safe to ignore them.
+ unsigned Hi_src0_mods = 0;
+ unsigned Hi_src1_mods = 0;
+ unsigned Hi_src2_mods = 0;
+
+ MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+ Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst
+ if (Src0_Mods & SISrcMods::NEG_HI) {
+ Hi_src0_mods |= SISrcMods::NEG_HI;
+ }
+ Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
+ if (Src0_Mods & SISrcMods::OP_SEL_1) {
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0
+ } else {
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0);
+ // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ Op0H_Op1H.addReg(SrcMO1.getReg(), 0, Src0SubIdx);
+ }
+
+ if (Src1_Mods & SISrcMods::NEG_HI) {
+ Hi_src1_mods |= SISrcMods::NEG_HI;
+ }
+ Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers
+
+ if (Src1_Mods & SISrcMods::OP_SEL_1) {
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0
+ } else {
+ Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0);
+ // if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src1SubIdx);
+ }
+
+ if (Src2_Mods & SISrcMods::NEG_HI) {
+ Hi_src2_mods |= SISrcMods::NEG_HI;
+ }
+ Op0H_Op1H.addImm(Hi_src2_mods); // src2_modifiers
+
+ if (Src2_Mods & SISrcMods::OP_SEL_1) {
+ unsigned Src2SubIdx =
+ TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(SrcMO3.getReg(), 0, Src2SubIdx); // src0
+ } else {
+ Op0H_Op1H.addImm(Hi_src2_mods); // src2_modifiers
+ unsigned Src2SubIdx =
+ TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0);
+ // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src2SubIdx);
+ }
Op0H_Op1H.addImm(ClampVal); // clamp
// packed instructions do not support output modifiers. safe to assign them 0
// for this use case
@@ -499,12 +681,14 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
I.eraseFromParent();
LIS->removeInterval(DstReg);
LIS->createAndComputeVirtRegInterval(DstReg);
- MIList.push_back(Op0L_Op1L);
- MIList.push_back(Op0H_Op1H);
- return MIList;
+ return;
}
void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
+ if (I.getOpcode() == AMDGPU::V_PK_FMA_F32) {
+ processFMAF32Unpacking(I);
+ return;
+ }
MachineBasicBlock &MBB = *I.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineFunction &MF = *MBB.getParent();
@@ -521,13 +705,8 @@ void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg());
const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg());
- const TargetRegisterClass *Src0SubRC =
- TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
- const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1);
-
bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- SmallVector<MachineInstr *, 2> UnpackedInstrs =
- insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64);
+ insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64);
return;
}
@@ -535,23 +714,17 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
uint16_t AvailableBudget) {
MachineBasicBlock &MBB = *I.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO0 = I.getOperand(2);
MachineOperand &SrcMO1 = I.getOperand(4);
-
Register DstReg = DstMO.getReg();
Register SrcReg0 = SrcMO0.getReg();
Register SrcReg1 = SrcMO1.getReg();
-
const DebugLoc &DL = I.getDebugLoc();
const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
auto SchedModel = TII->getSchedModel();
- uint16_t AddlCyclesConsumed = 0;
- SetVector<MachineInstr *> ListOfNewInstructions;
-
auto BuildImm = [&](uint32_t Val) -> std::pair<Register, uint16_t> {
Register ImmReg = MRI.createVirtualRegister(RC);
auto NewMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg)
@@ -576,8 +749,6 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
Register Src1_Lo = MRI.createVirtualRegister(RC);
Register Src0_Hi = MRI.createVirtualRegister(RC);
Register Src1_Hi = MRI.createVirtualRegister(RC);
- Register Input0 = MRI.createVirtualRegister(RC);
- Register Input1 = MRI.createVirtualRegister(RC);
unsigned SubRegID = 0;
if (SrcMO0.getSubReg())
@@ -703,6 +874,8 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
unsigned Lo_src1_mods = 0;
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
+ return;
// Unpacked instructions
MachineInstrBuilder LoMul_MI =
BuildMI(MBB, I, DL, TII->get(UnpackedOpcode), LoMul);
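
Patch 11 above extends unpacking to V_PK_FMA_F32. Setting modifiers aside, the value-level effect of that unpacking is simply two independent scalar FMAs, one per 32-bit lane. A small sketch of that decomposition (not the pass itself; PackedF32 and unpackedPkFmaF32 are illustrative names):

#include <array>
#include <cmath>

using PackedF32 = std::array<float, 2>; // {reg:sub0, reg:sub1}

// Sketch only: what the two emitted V_FMA_F32_e64 instructions compute when
// op_sel/op_sel_hi are clear, i.e. each lane reads its own sub-register.
PackedF32 unpackedPkFmaF32(PackedF32 A, PackedF32 B, PackedF32 C) {
  return {std::fma(A[0], B[0], C[0]),  // lo half -> vdst:sub0
          std::fma(A[1], B[1], C[1])}; // hi half -> vdst:sub1
}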
>From a77dab90ade2b91ef036d9b072af17e0ca52e162 Mon Sep 17 00:00:00 2001
From: Akash Dutta <137309513+akadutta at users.noreply.github.com>
Date: Thu, 21 Aug 2025 12:57:14 -0500
Subject: [PATCH 12/16] fix incorrect merge
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3358f8ff1d73a..9b8fa25b88f11 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
+#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
>From c467ed5b5e89cc6715d7b3501af7bc3ce5c2ea43 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Wed, 27 Aug 2025 14:14:38 -0500
Subject: [PATCH 13/16] check dependencies with MFMA inst and code cleanup
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 402 ++++++------------
1 file changed, 136 insertions(+), 266 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 8721fc7ec3afc..3bbed5a4d7e8a 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -77,6 +77,8 @@ class GCNPreRAOptimizationsImpl {
bool isVreg_64);
void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget);
void processFMAF32Unpacking(MachineInstr &I);
+ MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, uint16_t UnpackedOpcode, bool isHiBits, bool isFMA);
+ bool hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI);
bool IsF16MaskSet;
Register MaskLo; // mask to extract lower 16 bits for F16 packed instructions
@@ -262,9 +264,9 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
case AMDGPU::V_PK_MUL_F32:
case AMDGPU::V_PK_MUL_F16:
case AMDGPU::V_PK_ADD_F16:
+ return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg());
case AMDGPU::V_PK_FMA_F32:
- return true;
-
+ return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg() && MI.getOperand(6).isReg());
default:
return false;
}
@@ -291,6 +293,22 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
}
}
+bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI) {
+ for (const MachineOperand &Pred_Ops: PredMI.operands()) {
+ if (!Pred_Ops.isReg() || !Pred_Ops.isDef()) continue;
+ Register Pred_Reg = Pred_Ops.getReg();
+ if (!Pred_Reg.isValid()) continue;
+ for (const MachineOperand &Succ_Ops: SuccMI.operands()) {
+ if (!Succ_Ops.isReg() || !Succ_Ops.isDef()) continue;
+ Register Succ_Reg = Succ_Ops.getReg();
+ if (!Succ_Reg.isValid()) continue;
+ if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles) {
@@ -308,6 +326,7 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
SchedModel.resolveSchedClass(&Instr);
TotalCyclesBetweenCandidates +=
SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+
if (Instr.isMetaInstruction())
continue;
@@ -318,6 +337,9 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
return false;
if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
+ if (hasReadWriteDependencies(BeginMI, Instr)){
+ dbgs() << "## here\n";
+ }
if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) ||
(Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)) {
// unpacking packed F16 instructions requires multiple instructions.
@@ -368,126 +390,72 @@ void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1,
MachineOperand &HiSrcMO2, bool IsVreg_64) {
- MachineBasicBlock &MBB = *I.getParent();
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- MachineFunction &MF = *MBB.getParent();
+ MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
Register DstReg = DstMO.getReg();
- unsigned SrcSubIdx1 =
- TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
- unsigned SrcSubIdx2 =
- TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
- unsigned DestSubIdx =
- TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
-
- const MCInstrDesc InstrDesc = I.getDesc();
-
- int ClampIdx =
- AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
- int64_t ClampVal = I.getOperand(ClampIdx).getImm();
-
- int Src0_modifiers_Idx =
- AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
- int Src1_modifiers_Idx =
- AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
- unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
- unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
-
- // Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
- unsigned Lo_src0_mods = 0;
- unsigned Lo_src1_mods = 0;
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
return;
- MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
- Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst
- if (Src0_Mods & SISrcMods::NEG) {
- Lo_src0_mods |= SISrcMods::NEG;
- }
- Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
- if (Src0_Mods & SISrcMods::OP_SEL_0) {
- unsigned Src0SubIdx =
- TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
- Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); // src0
- } else {
- unsigned Src0SubIdx =
- TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
- Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0,
- Src0SubIdx); // src0 //if op_sel == 0, select register 0 of
- // reg:sub0_sub1
+ MachineInstrBuilder Op0L_Op1L = createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, false);
+ if (IsVreg_64) {
+ Op0L_Op1L->getOperand(0).setIsUndef();
+ } else if (DstMO.isUndef()) {
+ Op0L_Op1L->getOperand(0).setIsUndef();
}
- if (Src1_Mods & SISrcMods::NEG) {
- Lo_src1_mods |= SISrcMods::NEG;
+ LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
+
+ MachineInstrBuilder Op0H_Op1H = createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, false);
+ LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
+
+ if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
}
- Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers
- if (Src1_Mods & SISrcMods::OP_SEL_0) {
- unsigned Src1SubIdx =
- TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
- Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); // src0
- } else {
- unsigned Src1SubIdx =
- TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
- // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx);
+ if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract);
}
- Op0L_Op1L.addImm(ClampVal); // clamp
- // packed instructions do not support output modifiers. safe to assign them 0
- // for this use case
- Op0L_Op1L.addImm(0); // omod
- if (I.getOperand(0).isUndef()) {
+ LIS->RemoveMachineInstrFromMaps(I);
+ I.eraseFromParent();
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+ return;
+}
+
+void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) {
+ MachineBasicBlock &MBB = *I.getParent();
+ Register DstReg = I.getOperand(0).getReg();
+ const DebugLoc &DL = I.getDebugLoc();
+ const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg());
+ bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
+
+ uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
+ return;
+
+ MachineInstrBuilder Op0L_Op1L = createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, true);
+ if (IsVReg64)
+ Op0L_Op1L->getOperand(0).setIsUndef();
+ else if (I.getOperand(0).isUndef()) {
Op0L_Op1L->getOperand(0).setIsUndef();
}
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
- SrcSubIdx1 = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
- SrcSubIdx2 = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
- DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
- // Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
- unsigned Hi_src0_mods = 0;
- unsigned Hi_src1_mods = 0;
- MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
- Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst
- if (Src0_Mods & SISrcMods::NEG_HI) {
- Hi_src0_mods |= SISrcMods::NEG_HI;
- }
- Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
- if (Src0_Mods & SISrcMods::OP_SEL_1) {
- unsigned Src0SubIdx =
- TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1);
- Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); // src0
- } else {
- unsigned Src0SubIdx =
- TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0);
- // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx);
- }
- if (Src1_Mods & SISrcMods::NEG_HI) {
- Hi_src1_mods |= SISrcMods::NEG_HI;
- }
- Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers
- if (Src1_Mods & SISrcMods::OP_SEL_1) {
- unsigned Src1SubIdx =
- TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1);
- Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); // src0
- } else {
- unsigned Src1SubIdx =
- TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0);
- // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx);
- }
- Op0H_Op1H.addImm(ClampVal); // clamp
- // packed instructions do not support output modifiers. safe to assign them 0
- // for this use case
- Op0H_Op1H.addImm(0); // omod
+ MachineInstrBuilder Op0H_Op1H = createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, true);
LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
}
+ if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract);
+ }
+
LIS->RemoveMachineInstrFromMaps(I);
I.eraseFromParent();
LIS->removeInterval(DstReg);
@@ -495,39 +463,15 @@ void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
return;
}
-void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) {
- MachineBasicBlock &MBB = *I.getParent();
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- MachineFunction &MF = *MBB.getParent();
-
- Register DstReg = I.getOperand(0).getReg();
- Register SrcReg1 = I.getOperand(2).getReg();
- Register SrcReg2 = I.getOperand(4).getReg();
- Register SrcReg3 = I.getOperand(6).getReg();
+MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, uint16_t UnpackedOpcode, bool isHiBits, bool isFMA) {
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
- MachineOperand &SrcMO3 = I.getOperand(6);
-
- const DebugLoc &DL = I.getDebugLoc();
- const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg());
- const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg());
- const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg());
- const TargetRegisterClass *Src2RC = MRI.getRegClass(I.getOperand(6).getReg());
-
- bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
-
- // insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64);
- unsigned SrcSubIdx1 =
- TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0);
- unsigned SrcSubIdx2 =
- TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0);
- unsigned SrcSubIdx3 =
- TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0);
- unsigned DestSubIdx =
- TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
-
- const MCInstrDesc InstrDesc = I.getDesc();
+ Register DstReg = DstMO.getReg();
+ Register SrcReg1 = SrcMO1.getReg();
+ Register SrcReg2 = SrcMO2.getReg();
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstMO.getReg());
+ unsigned DestSubIdx = isHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1) : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
int ClampIdx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
int64_t ClampVal = I.getOperand(ClampIdx).getImm();
@@ -535,153 +479,83 @@ void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) {
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
int Src1_modifiers_Idx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
- int Src2_modifiers_Idx =
- AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src2_modifiers);
+
unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
- unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
-
// Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
- unsigned Lo_src0_mods = 0;
- unsigned Lo_src1_mods = 0;
- unsigned Lo_src2_mods = 0;
- uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
- if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
- return;
-
- MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
- Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst
- if (Src0_Mods & SISrcMods::NEG) {
- Lo_src0_mods |= SISrcMods::NEG;
- }
- Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
- if (Src0_Mods & SISrcMods::OP_SEL_0) {
+ unsigned New_Src0_Mods = 0;
+ unsigned New_Src1_Mods = 0;
+
+ unsigned NegModifier = isHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
+ unsigned OpSelModifier = isHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
+
+ MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+ NewMI.addDef(DstReg, 0, DestSubIdx); // vdst
+ if (Src0_Mods & NegModifier) {
+ New_Src0_Mods |= SISrcMods::NEG;
+ }
+ NewMI.addImm(New_Src0_Mods); // src0_modifiers
+
+ if (Src0_Mods & OpSelModifier) {
unsigned Src0SubIdx =
TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1);
- Op0L_Op1L.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0
+ NewMI.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0
} else {
unsigned Src0SubIdx =
TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0);
// if op_sel == 0, select register 0 of reg:sub0_sub1
- Op0L_Op1L.addReg(SrcMO1.getReg(), 0, Src0SubIdx);
+ NewMI.addReg(SrcMO1.getReg(), 0, Src0SubIdx);
}
- if (Src1_Mods & SISrcMods::NEG) {
- Lo_src1_mods |= SISrcMods::NEG;
+ if (Src1_Mods & NegModifier) {
+ New_Src1_Mods |= SISrcMods::NEG;
}
- Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers
- if (Src1_Mods & SISrcMods::OP_SEL_0) {
+ NewMI.addImm(New_Src1_Mods); // src1_modifiers
+ if (Src1_Mods & OpSelModifier) {
unsigned Src1SubIdx =
TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1);
- Op0L_Op1L.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0
+ NewMI.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0
} else {
- unsigned Src1SubIdx =
- TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0);
- Op0L_Op1L.addReg(SrcMO2.getReg(), 0,
- Src1SubIdx); // src0 //if op_sel_hi == 0, select register 0
- // of reg:sub0_sub1
- }
-
- if (Src2_Mods & SISrcMods::NEG) {
- Lo_src2_mods |= SISrcMods::NEG;
- }
- Op0L_Op1L.addImm(Lo_src2_mods); // src2_modifiers
- if (Src2_Mods & SISrcMods::OP_SEL_0) {
- unsigned Src2SubIdx =
- TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1);
- Op0L_Op1L.addReg(SrcMO3.getReg(), 0, Src2SubIdx);
- } else {
- unsigned Src2SubIdx =
- TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0);
// if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- Op0L_Op1L.addReg(SrcMO3.getReg(), 0, Src2SubIdx);
- }
- Op0L_Op1L.addImm(ClampVal); // clamp
- // packed instructions do not support output modifiers. safe to assign them 0
- // for this use case
- Op0L_Op1L.addImm(0); // omod
-
- if (I.getOperand(0).isUndef()) {
- Op0L_Op1L->getOperand(0).setIsUndef();
- }
-
- LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
-
- SrcSubIdx1 = TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1);
- SrcSubIdx2 = TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1);
- SrcSubIdx3 = TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1);
- DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
-
- // Packed instructions (VOP3P) do not support abs. It is safe to ignore them.
- unsigned Hi_src0_mods = 0;
- unsigned Hi_src1_mods = 0;
- unsigned Hi_src2_mods = 0;
-
- MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
- Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst
- if (Src0_Mods & SISrcMods::NEG_HI) {
- Hi_src0_mods |= SISrcMods::NEG_HI;
- }
- Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
- if (Src0_Mods & SISrcMods::OP_SEL_1) {
- unsigned Src0SubIdx =
- TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1);
- Op0H_Op1H.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0
- } else {
- unsigned Src0SubIdx =
- TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0);
- // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- Op0H_Op1H.addReg(SrcMO1.getReg(), 0, Src0SubIdx);
- }
-
- if (Src1_Mods & SISrcMods::NEG_HI) {
- Hi_src1_mods |= SISrcMods::NEG_HI;
- }
- Op0H_Op1H.addImm(Hi_src1_mods); // src0_modifiers
-
- if (Src1_Mods & SISrcMods::OP_SEL_1) {
- unsigned Src1SubIdx =
- TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1);
- Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0
- } else {
- Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers
unsigned Src1SubIdx =
TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0);
- // if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src1SubIdx);
- }
-
- if (Src2_Mods & SISrcMods::NEG_HI) {
- Hi_src2_mods |= SISrcMods::NEG_HI;
- }
- Op0H_Op1H.addImm(Hi_src2_mods); // src2_modifiers
-
- if (Src2_Mods & SISrcMods::OP_SEL_1) {
- unsigned Src2SubIdx =
- TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1);
- Op0H_Op1H.addReg(SrcMO3.getReg(), 0, Src2SubIdx); // src0
- } else {
- Op0H_Op1H.addImm(Hi_src2_mods); // src2_modifiers
- unsigned Src2SubIdx =
- TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0);
- // src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- Op0H_Op1H.addReg(SrcMO2.getReg(), 0, Src2SubIdx);
+ NewMI.addReg(SrcMO2.getReg(), 0,
+ Src1SubIdx);
+ }
+
+ if (isFMA) {
+ MachineOperand &SrcMO3 = I.getOperand(6);
+ Register SrcReg3 = SrcMO3.getReg();
+ int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src2_modifiers);
+ unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
+ unsigned New_Src2_Mods = 0;
+ //If NEG or NEG_HI is true, we need to negate the corresponding 32 bit lane.
+ // This is also true for NEG_HI as it shares the same bit position with ABS.
+ // But packed instructions do not support ABS. Therefore, NEG_HI must
+ // be translated to NEG source modifier for the higher 32 bits.
+ // Unpacked VOP3 instructions do support ABS, therefore we need to explicitly add
+ // the NEG modifier if present in the packed instruction
+ if (Src2_Mods & NegModifier) {
+ // New_Src2_Mods |= NegModifier;
+ New_Src2_Mods |= SISrcMods::NEG;
+ }
+ NewMI.addImm(New_Src2_Mods); // src2_modifiers
+ if (Src2_Mods & OpSelModifier) {
+ unsigned Src2SubIdx =
+ TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1);
+ NewMI.addReg(SrcMO3.getReg(), 0, Src2SubIdx);
+ } else {
+ unsigned Src2SubIdx =
+ TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0);
+ // if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ NewMI.addReg(SrcMO3.getReg(), 0, Src2SubIdx);
+ }
}
- Op0H_Op1H.addImm(ClampVal); // clamp
+ NewMI.addImm(ClampVal); // clamp
// packed instructions do not support output modifiers. safe to assign them 0
// for this use case
- Op0H_Op1H.addImm(0); // omod
- LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
-
- if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
- Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
- Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
- }
- LIS->RemoveMachineInstrFromMaps(I);
- I.eraseFromParent();
- LIS->removeInterval(DstReg);
- LIS->createAndComputeVirtRegInterval(DstReg);
- return;
+ NewMI.addImm(0); // omod
+ return NewMI;
}
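
To make the lo/hi modifier handling above easier to follow, here is a minimal standalone C++ sketch of the mapping. The bit values, helper name, and printout are illustrative stand-ins only; they are not the real SISrcMods encodings or MachineInstrBuilder calls.

  #include <cstdio>
  #include <utility>

  // Hypothetical modifier bits for one packed source operand (stand-ins only).
  constexpr unsigned NEG      = 1u << 0; // negate the low 32-bit lane
  constexpr unsigned NEG_HI   = 1u << 1; // negate the high 32-bit lane
  constexpr unsigned OP_SEL_0 = 1u << 2; // low lane reads sub1 instead of sub0
  constexpr unsigned OP_SEL_1 = 1u << 3; // high lane reads sub1 instead of sub0

  // For one half of the packed op, return {unpacked src modifier, sub-register index}.
  // NEG_HI on the packed op becomes a plain NEG on the unpacked high-half op, and
  // the op_sel/op_sel_hi bit picks sub1 over sub0 for the source register.
  static std::pair<unsigned, int> unpackHalf(unsigned PackedMods, bool IsHiBits) {
    unsigned NegBit = IsHiBits ? NEG_HI : NEG;
    unsigned OpSelBit = IsHiBits ? OP_SEL_1 : OP_SEL_0;
    unsigned NewMods = (PackedMods & NegBit) ? NEG : 0u;
    int SubIdx = (PackedMods & OpSelBit) ? 1 : 0;
    return {NewMods, SubIdx};
  }

  int main() {
    unsigned Mods = NEG_HI | OP_SEL_1; // negate and swap only the high half
    for (bool Hi : {false, true}) {
      std::pair<unsigned, int> R = unpackHalf(Mods, Hi);
      std::printf("%s half: src_mods=0x%x, reads sub%d\n",
                  Hi ? "hi" : "lo", R.first, R.second);
    }
    return 0;
  }
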
void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
@@ -690,20 +564,13 @@ void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
return;
}
MachineBasicBlock &MBB = *I.getParent();
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- MachineFunction &MF = *MBB.getParent();
-
- Register DstReg = I.getOperand(0).getReg();
- Register SrcReg1 = I.getOperand(2).getReg();
- Register SrcReg2 = I.getOperand(4).getReg();
+
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
const DebugLoc &DL = I.getDebugLoc();
- const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg());
- const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg());
- const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg());
+ const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg());
bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64);
@@ -1000,6 +867,8 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
// co-issue unpacked instructions with MFMA
for (MachineBasicBlock &MBB : MF) {
SetVector<MachineInstr *> InstrsToUnpack;
+ SetVector<MachineOperand *> WriteOperands;
+ SetVector<MachineOperand *> ReadOperands;
IsF16MaskSet = false;
uint16_t NumMFMACycles = 0;
auto SchedModel = TII->getSchedModel();
@@ -1050,5 +919,6 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
}
}
}
+ LIS->reanalyze(MF);
return Changed;
}
\ No newline at end of file
>From 1bcbebaf648b3a551d1429e1206a8264fbc14d5c Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 2 Sep 2025 16:31:40 -0500
Subject: [PATCH 14/16] remove f16 support && add dependency checks
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 417 +++---------------
...npack-non-coissue-insts-post-scheduler.mir | 55 ---
2 files changed, 68 insertions(+), 404 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 3bbed5a4d7e8a..e2d1fc073bd73 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -34,6 +34,7 @@
/// in machine schedules and is expected to improve performance. Only those
/// packed instructions are unpacked that are overlapped by the MFMA latency.
/// Rest should remain untouched.
+/// TODO: Add support for F16 packed instructions
//===----------------------------------------------------------------------===//
#include "GCNPreRAOptimizations.h"
@@ -75,15 +76,13 @@ class GCNPreRAOptimizationsImpl {
MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
bool isVreg_64);
- void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget);
void processFMAF32Unpacking(MachineInstr &I);
- MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, uint16_t UnpackedOpcode, bool isHiBits, bool isFMA);
- bool hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI);
-
- bool IsF16MaskSet;
- Register MaskLo; // mask to extract lower 16 bits for F16 packed instructions
- Register
- ShiftAmt; // mask to extract higher 16 bits from F16 packed instructions
+ MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
+ const DebugLoc &DL,
+ uint16_t UnpackedOpcode, bool isHiBits,
+ bool isFMA);
+ bool hasReadWriteDependencies(const MachineInstr &PredMI,
+ const MachineInstr &SuccMI);
public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -262,11 +261,10 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_ADD_F16:
return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg());
case AMDGPU::V_PK_FMA_F32:
- return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg() && MI.getOperand(6).isReg());
+ return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg() &&
+ MI.getOperand(6).isReg());
default:
return false;
}
@@ -282,10 +280,6 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
return AMDGPU::V_ADD_F32_e64;
case AMDGPU::V_PK_MUL_F32:
return AMDGPU::V_MUL_F32_e64;
- case AMDGPU::V_PK_ADD_F16:
- return AMDGPU::V_ADD_F16_e64;
- case AMDGPU::V_PK_MUL_F16:
- return AMDGPU::V_MUL_F16_e64;
case AMDGPU::V_PK_FMA_F32:
return AMDGPU::V_FMA_F32_e64;
default:
@@ -293,15 +287,20 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
}
}
-bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI) {
- for (const MachineOperand &Pred_Ops: PredMI.operands()) {
- if (!Pred_Ops.isReg() || !Pred_Ops.isDef()) continue;
+bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies(
+ const MachineInstr &PredMI, const MachineInstr &SuccMI) {
+ for (const MachineOperand &Pred_Ops : PredMI.operands()) {
+ if (!Pred_Ops.isReg() || !Pred_Ops.isDef())
+ continue;
Register Pred_Reg = Pred_Ops.getReg();
- if (!Pred_Reg.isValid()) continue;
- for (const MachineOperand &Succ_Ops: SuccMI.operands()) {
- if (!Succ_Ops.isReg() || !Succ_Ops.isDef()) continue;
+ if (!Pred_Reg.isValid())
+ continue;
+ for (const MachineOperand &Succ_Ops : SuccMI.operands()) {
+ if (!Succ_Ops.isReg() || !Succ_Ops.isDef())
+ continue;
Register Succ_Reg = Succ_Ops.getReg();
- if (!Succ_Reg.isValid()) continue;
+ if (!Succ_Reg.isValid())
+ continue;
if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) {
return true;
}
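
The overlap walk above can be modeled in isolation. A toy, self-contained version is sketched below; the Reg struct and its unit ranges are assumptions made only for this example and do not mirror the real MachineOperand or TargetRegisterInfo types.

  #include <vector>

  // Toy register: the inclusive range of register units it covers.
  struct Reg {
    unsigned Lo, Hi;
  };

  // Two registers overlap if their unit ranges intersect.
  static bool regsOverlap(const Reg &A, const Reg &B) {
    return A.Lo <= B.Hi && B.Lo <= A.Hi;
  }

  // Mirrors the conservative bail-out: any overlap between a register defined by
  // the predecessor (the MFMA) and one defined by the candidate blocks unpacking.
  static bool hasOverlappingDefs(const std::vector<Reg> &PredDefs,
                                 const std::vector<Reg> &SuccDefs) {
    for (const Reg &P : PredDefs)
      for (const Reg &S : SuccDefs)
        if (regsOverlap(P, S))
          return true;
    return false;
  }

  int main() {
    std::vector<Reg> MFMADefs = {{0, 7}};   // e.g. a wide accumulator tuple
    std::vector<Reg> PackedDefs = {{6, 7}}; // candidate writes into that range
    return hasOverlappingDefs(MFMADefs, PackedDefs) ? 0 : 1; // 0 => skip unpacking
  }
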
@@ -315,9 +314,7 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
auto *BB = BeginMI.getParent();
auto *MF = BB->getParent();
int NumInst = 0;
-
auto E = BB->end();
-
int TotalCyclesBetweenCandidates = 0;
auto SchedModel = TII->getSchedModel();
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
@@ -329,56 +326,25 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
if (Instr.isMetaInstruction())
continue;
-
if (Instr.isTerminator())
return false;
-
if (TotalCyclesBetweenCandidates > NumMFMACycles)
return false;
-
if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
- if (hasReadWriteDependencies(BeginMI, Instr)){
- dbgs() << "## here\n";
- }
- if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) ||
- (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)) {
- // unpacking packed F16 instructions requires multiple instructions.
- // Instructions are issued to extract lower and higher bits for each
- // operand Instructions are then issued for 2 unpacked instructions, and
- // additional instructions to put them back into the original
- // destination register The following sequence of instructions are
- // issued
-
- // The next two are needed to move masks into vgprs. Ideally, immediates
- // should be used. However, if one of the source operands are
- // sgpr/sregs, then immediates are not allowed. Hence, the need to move
- // these into vgprs
-
- // vgpr_32 = V_MOV_B32_e32 65535
- // vgpr_32 = V_MOV_B32_e32 16
-
- // vgpr_32 = V_AND_B32_e32 sub1:sreg_64, vgpr_32
- // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, sub1:sreg_64
- // vgpr_32 = V_AND_B32_e32 vgpr_32, vgpr_32
- // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, vgpr_32
- // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
- // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
- // vgpr_32 = V_LSHLREV_B32_e64 vgpr_32, vgpr_32
- // dst_reg = V_OR_B32_e64 vgpr_32, vgpr_32
-
- // we need to issue the MOV instructions above only once. Once these are
- // issued, the IsF16MaskSet flag is set subsequent unpacking only needs
- // to issue the remaining instructions The number of latency cycles for
- // each instruction above is 1. It's hard coded into the code to reduce
- // code complexity.
- if (IsF16MaskSet)
- TotalCyclesBetweenCandidates += 7;
- else
- TotalCyclesBetweenCandidates += 9;
- } else
- TotalCyclesBetweenCandidates += 1;
-
- if (!(TotalCyclesBetweenCandidates > NumMFMACycles))
+ if (hasReadWriteDependencies(BeginMI, Instr))
+ return false;
+
+      // if it is a packed instruction, we should subtract its latency from the
+ // overall latency calculation here, because the packed instruction will
+ // be removed and replaced by 2 unpacked instructions
+ TotalCyclesBetweenCandidates -=
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+ // We're adding 2 to account for the extra latency added by unpacking into
+ // 2 instructions. At the time of writing, the considered unpacked
+ // instructions have latency of 1.
+ // TODO: improve latency handling of possible inserted instructions
+ TotalCyclesBetweenCandidates += 2;
+ if (!(TotalCyclesBetweenCandidates >= NumMFMACycles))
InstrsToUnpack.insert(&Instr);
}
}
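
For reference, a standalone sketch of the cycle accounting in this hunk, under assumed numbers (the MFMA cycle count, per-candidate latencies, and the two-cycle charge are illustrative, not values read from the real scheduling model; intervening non-candidate instructions, which also add their latencies in the real pass, are omitted for brevity):

  #include <cstdio>
  #include <vector>

  int main() {
    const int NumMFMACycles = 16; // assumed issue latency of the preceding MFMA
    // Assumed latencies of the packed candidates that trail the MFMA.
    const std::vector<int> PackedLatency = {1, 1, 1};
    int TotalCycles = 0;
    for (size_t Idx = 0; Idx < PackedLatency.size(); ++Idx) {
      // Drop the packed instruction's own cost and charge 2 cycles for the pair
      // of unpacked instructions that replaces it.
      TotalCycles -= PackedLatency[Idx];
      TotalCycles += 2;
      bool Unpack = TotalCycles < NumMFMACycles; // still hidden by the MFMA
      std::printf("candidate %zu: total=%d -> %s\n", Idx, TotalCycles,
                  Unpack ? "unpack" : "keep packed");
    }
    return 0;
  }
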
@@ -390,7 +356,7 @@ void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1,
MachineOperand &HiSrcMO2, bool IsVreg_64) {
- MachineBasicBlock &MBB = *I.getParent();
+ MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
Register DstReg = DstMO.getReg();
@@ -398,15 +364,17 @@ void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
return;
- MachineInstrBuilder Op0L_Op1L = createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, false);
+ MachineInstrBuilder Op0L_Op1L =
+ createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, false);
if (IsVreg_64) {
Op0L_Op1L->getOperand(0).setIsUndef();
} else if (DstMO.isUndef()) {
Op0L_Op1L->getOperand(0).setIsUndef();
}
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
-
- MachineInstrBuilder Op0H_Op1H = createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, false);
+
+ MachineInstrBuilder Op0H_Op1H =
+ createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, false);
LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
@@ -435,8 +403,9 @@ void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) {
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
return;
-
- MachineInstrBuilder Op0L_Op1L = createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, true);
+
+ MachineInstrBuilder Op0L_Op1L =
+ createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, true);
if (IsVReg64)
Op0L_Op1L->getOperand(0).setIsUndef();
else if (I.getOperand(0).isUndef()) {
@@ -444,7 +413,8 @@ void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) {
}
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
- MachineInstrBuilder Op0H_Op1H = createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, true);
+ MachineInstrBuilder Op0H_Op1H =
+ createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, true);
LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
@@ -463,7 +433,9 @@ void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) {
return;
}
-MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL, uint16_t UnpackedOpcode, bool isHiBits, bool isFMA) {
+MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(
+ MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL,
+ uint16_t UnpackedOpcode, bool isHiBits, bool isFMA) {
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
@@ -471,7 +443,9 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(MachineBasicBloc
Register SrcReg1 = SrcMO1.getReg();
Register SrcReg2 = SrcMO2.getReg();
const TargetRegisterClass *DstRC = MRI->getRegClass(DstMO.getReg());
- unsigned DestSubIdx = isHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1) : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
+ unsigned DestSubIdx =
+ isHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1)
+ : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
int ClampIdx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
int64_t ClampVal = I.getOperand(ClampIdx).getImm();
@@ -479,16 +453,16 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(MachineBasicBloc
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
int Src1_modifiers_Idx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
-
+
unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
// Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
unsigned New_Src0_Mods = 0;
unsigned New_Src1_Mods = 0;
-
+
unsigned NegModifier = isHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
unsigned OpSelModifier = isHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
-
+
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(DstReg, 0, DestSubIdx); // vdst
if (Src0_Mods & NegModifier) {
@@ -519,25 +493,26 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(MachineBasicBloc
// if op_sel_hi == 0, select register 0 of reg:sub0_sub1
unsigned Src1SubIdx =
TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0);
- NewMI.addReg(SrcMO2.getReg(), 0,
- Src1SubIdx);
+ NewMI.addReg(SrcMO2.getReg(), 0, Src1SubIdx);
}
if (isFMA) {
MachineOperand &SrcMO3 = I.getOperand(6);
Register SrcReg3 = SrcMO3.getReg();
- int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src2_modifiers);
+ int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(
+ I.getOpcode(), AMDGPU::OpName::src2_modifiers);
unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
unsigned New_Src2_Mods = 0;
- //If NEG or NEG_HI is true, we need to negate the corresponding 32 bit lane.
- // This is also true for NEG_HI as it shares the same bit position with ABS.
- // But packed instructions do not support ABS. Therefore, NEG_HI must
- // be translated to NEG source modifier for the higher 32 bits.
- // Unpacked VOP3 instructions do support ABS, therefore we need to explicitly add
- // the NEG modifier if present in the packed instruction
+ // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
+ // lane.
+ // This is also true for NEG_HI as it shares the same bit position with
+ // ABS. But packed instructions do not support ABS. Therefore, NEG_HI must
+ // be translated to NEG source modifier for the higher 32 bits.
+ // Unpacked VOP3 instructions do support ABS, therefore we need to
+ // explicitly add the NEG modifier if present in the packed instruction
if (Src2_Mods & NegModifier) {
// New_Src2_Mods |= NegModifier;
- New_Src2_Mods |= SISrcMods::NEG;
+ New_Src2_Mods |= SISrcMods::NEG;
}
NewMI.addImm(New_Src2_Mods); // src2_modifiers
if (Src2_Mods & OpSelModifier) {
@@ -564,7 +539,7 @@ void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
return;
}
MachineBasicBlock &MBB = *I.getParent();
-
+
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
@@ -577,256 +552,6 @@ void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
return;
}
-void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
- uint16_t AvailableBudget) {
- MachineBasicBlock &MBB = *I.getParent();
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- MachineOperand &DstMO = I.getOperand(0);
- MachineOperand &SrcMO0 = I.getOperand(2);
- MachineOperand &SrcMO1 = I.getOperand(4);
- Register DstReg = DstMO.getReg();
- Register SrcReg0 = SrcMO0.getReg();
- Register SrcReg1 = SrcMO1.getReg();
- const DebugLoc &DL = I.getDebugLoc();
-
- const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
- auto SchedModel = TII->getSchedModel();
-
- auto BuildImm = [&](uint32_t Val) -> std::pair<Register, uint16_t> {
- Register ImmReg = MRI.createVirtualRegister(RC);
- auto NewMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg)
- .addImm(Val);
- LIS->InsertMachineInstrInMaps(*NewMI);
- const MCSchedClassDesc *SchedClassDesc =
- SchedModel.resolveSchedClass(NewMI);
- uint16_t LatencyCycles =
- SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
- return {ImmReg, LatencyCycles};
- };
-
- if (!IsF16MaskSet) {
- std::pair<Register, uint16_t> RegAndLatency = BuildImm(0x0000FFFF);
- MaskLo = RegAndLatency.first; // mask for lower 16 bits
- RegAndLatency = BuildImm(16);
- ShiftAmt = RegAndLatency.first; // mask for higher 16 bits
- IsF16MaskSet = true;
- }
-
- Register Src0_Lo = MRI.createVirtualRegister(RC);
- Register Src1_Lo = MRI.createVirtualRegister(RC);
- Register Src0_Hi = MRI.createVirtualRegister(RC);
- Register Src1_Hi = MRI.createVirtualRegister(RC);
-
- unsigned SubRegID = 0;
- if (SrcMO0.getSubReg())
- SubRegID = SrcMO0.getSubReg();
-
- int Src0_modifiers_Idx =
- AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
- int Src1_modifiers_Idx =
- AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
- unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
- unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
- int ClampIdx =
- AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
- int64_t ClampVal = I.getOperand(ClampIdx).getImm();
-
- // handle op_sel for src0
- if (Src0_Mods & SISrcMods::OP_SEL_0) {
- // if op_sel is set, select higher 16 bits and copy into lower 16 bits of
- // new vgpr
- MachineInstrBuilder LoInput0_MI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo)
- .addReg(ShiftAmt);
- if (SubRegID)
- LoInput0_MI.addReg(SrcReg0, 0, SubRegID);
- else
- LoInput0_MI.addReg(SrcReg0);
- LIS->InsertMachineInstrInMaps(*LoInput0_MI);
- } else {
- // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of
- // new vgpr
- MachineInstrBuilder LoInput0_MI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo);
- if (SubRegID)
- LoInput0_MI.addReg(SrcReg0, 0, SubRegID);
- else
- LoInput0_MI.addReg(SrcReg0);
- LoInput0_MI.addReg(MaskLo);
- LIS->InsertMachineInstrInMaps(*LoInput0_MI);
- }
-
- // handle op_sel_hi for src0
- if (Src0_Mods & SISrcMods::OP_SEL_1) {
- // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of
- // new vgpr
- MachineInstrBuilder HiInput0_MI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi)
- .addReg(ShiftAmt);
- if (SubRegID)
- HiInput0_MI.addReg(SrcReg0, 0, SubRegID);
- else
- HiInput0_MI.addReg(SrcReg0);
- LIS->InsertMachineInstrInMaps(*HiInput0_MI);
- } else {
- // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits
- // of new vgpr
- MachineInstrBuilder HiInput0_MI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi);
- if (SubRegID)
- HiInput0_MI.addReg(SrcReg0, 0, SubRegID);
- else
- HiInput0_MI.addReg(SrcReg0);
- HiInput0_MI.addReg(MaskLo);
- LIS->InsertMachineInstrInMaps(*HiInput0_MI);
- }
-
- SubRegID = 0;
- if (SrcMO0.getSubReg())
- SubRegID = SrcMO1.getSubReg();
- // handle op_sel for src1
- if (Src1_Mods & SISrcMods::OP_SEL_0) {
- // if op_sel is set, select higher 16 bits and copy into lower 16 bits of
- // new vgpr
- MachineInstrBuilder LoInput1_MI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo)
- .addReg(ShiftAmt);
- if (SubRegID)
- LoInput1_MI.addReg(SrcReg1, 0, SubRegID);
- else
- LoInput1_MI.addReg(SrcReg1);
- LIS->InsertMachineInstrInMaps(*LoInput1_MI);
- } else {
- // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of
- // new vgpr
- MachineInstrBuilder LoInput1_MI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo);
- if (SubRegID)
- LoInput1_MI.addReg(SrcReg1, 0, SubRegID);
- else
- LoInput1_MI.addReg(SrcReg1);
- LoInput1_MI.addReg(MaskLo);
- LIS->InsertMachineInstrInMaps(*LoInput1_MI);
- }
-
- // handle op_sel_hi for src1
- if (Src1_Mods & SISrcMods::OP_SEL_1) {
- // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of
- // new vgpr
- MachineInstrBuilder HiInput1_MI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi)
- .addReg(ShiftAmt);
- if (SubRegID)
- HiInput1_MI.addReg(SrcReg1, 0, SubRegID);
- else
- HiInput1_MI.addReg(SrcReg1);
- LIS->InsertMachineInstrInMaps(*HiInput1_MI);
- } else {
- // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits
- // of new vgpr
- MachineInstrBuilder HiInput1_MI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi);
- if (SubRegID)
- HiInput1_MI.addReg(SrcReg1, 0, SubRegID);
- else
- HiInput1_MI.addReg(SrcReg1);
- HiInput1_MI.addReg(MaskLo);
- LIS->InsertMachineInstrInMaps(*HiInput1_MI);
- }
-
- Register LoMul = MRI.createVirtualRegister(RC);
- Register HiMul = MRI.createVirtualRegister(RC);
-
- unsigned Lo_src0_mods = 0;
- unsigned Lo_src1_mods = 0;
- uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
-
- if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
- return;
- // Unpacked instructions
- MachineInstrBuilder LoMul_MI =
- BuildMI(MBB, I, DL, TII->get(UnpackedOpcode), LoMul);
-
- if (Src0_Mods & SISrcMods::NEG)
- Lo_src0_mods |= SISrcMods::NEG;
-
- LoMul_MI.addImm(Lo_src0_mods); // src0_modifiers
- LoMul_MI.addReg(Src0_Lo, RegState::Kill); // src0
-
- if (Src1_Mods & SISrcMods::NEG)
- Lo_src1_mods |= SISrcMods::NEG;
-
- LoMul_MI.addImm(Lo_src1_mods); // src1_modifiers
- LoMul_MI.addReg(Src1_Lo, RegState::Kill); // src1
- LoMul_MI.addImm(ClampVal); // clamp
- // packed instructions do not support output modifiers. safe to assign them 0
- // for this use case
- LoMul_MI.addImm(0); // omod
-
- // unpacked instruction with VOP3 encoding for Hi bits
- unsigned Hi_src0_mods = 0;
- unsigned Hi_src1_mods = 0;
-
- MachineInstrBuilder HiMul_MI =
- BuildMI(MBB, I, DL, TII->get(UnpackedOpcode), HiMul);
- if (Src0_Mods & SISrcMods::NEG_HI)
- Hi_src0_mods |= SISrcMods::NEG_HI;
-
- HiMul_MI.addImm(Hi_src0_mods); // src0_modifiers
- HiMul_MI.addReg(Src0_Hi,
- RegState::Kill); // select higher 16 bits if op_sel_hi is set
-
- if (Src1_Mods & SISrcMods::NEG_HI)
- Hi_src1_mods |= SISrcMods::NEG_HI;
-
- HiMul_MI.addImm(Hi_src1_mods); // src0_modifiers
- HiMul_MI.addReg(
- Src1_Hi,
- RegState::Kill); // select higher 16 bits from src1 if op_sel_hi is set
- HiMul_MI.addImm(ClampVal); // clamp
- // packed instructions do not support output modifiers. safe to assign them 0
- // for this use case
- HiMul_MI.addImm(0); // omod
-
- // Shift HiMul left by 16
- Register HiMulShifted = MRI.createVirtualRegister(RC);
- MachineInstrBuilder HiMulShifted_MI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted)
- .addReg(ShiftAmt)
- .addReg(HiMul);
-
- SubRegID = 0;
- if (DstMO.getSubReg())
- SubRegID = DstMO.getSubReg();
- // OR LoMul | (HiMul << 16)
- MachineInstrBuilder RewriteBackToDst_MI =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64));
- if (SubRegID) {
- if (DstMO.isUndef()) {
- RewriteBackToDst_MI.addDef(DstReg, RegState::Undef, SubRegID);
- } else {
- RewriteBackToDst_MI.addDef(DstReg, 0, SubRegID);
- }
- } else {
- if (DstMO.isUndef()) {
- RewriteBackToDst_MI.addDef(DstReg, RegState::Undef);
- } else {
- RewriteBackToDst_MI.addDef(DstReg);
- }
- }
- RewriteBackToDst_MI.addReg(LoMul);
- RewriteBackToDst_MI.addReg(HiMulShifted);
-
- LIS->InsertMachineInstrInMaps(*LoMul_MI);
- LIS->InsertMachineInstrInMaps(*HiMul_MI);
- LIS->InsertMachineInstrInMaps(*HiMulShifted_MI);
- LIS->InsertMachineInstrInMaps(*RewriteBackToDst_MI);
- LIS->RemoveMachineInstrFromMaps(I);
- I.eraseFromParent();
- LIS->removeInterval(DstReg);
- LIS->createAndComputeVirtRegInterval(DstReg);
-}
-
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -869,7 +594,6 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
SetVector<MachineInstr *> InstrsToUnpack;
SetVector<MachineOperand *> WriteOperands;
SetVector<MachineOperand *> ReadOperands;
- IsF16MaskSet = false;
uint16_t NumMFMACycles = 0;
auto SchedModel = TII->getSchedModel();
for (MachineInstr &MI : MBB) {
@@ -910,12 +634,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
if (!InstrsToUnpack.empty()) {
for (MachineInstr *MI : InstrsToUnpack) {
- if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) ||
- (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) {
- processF16Unpacking(*MI, NumMFMACycles);
- } else {
- processF32Unpacking(*MI);
- }
+ processF32Unpacking(*MI);
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
index b13f61a963ed5..6b871b1d1881b 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
@@ -152,58 +152,3 @@ body: |
%179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 0, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
-...
----
-name: test_only_overlapped_unpacking_f16
-tracksRegLiveness: true
-liveins:
- - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
-body: |
- bb.0.entry:
- liveins: $sgpr4_sgpr5
- ; GCN-LABEL: name: test_only_overlapped_unpacking_f16
- ; GCN: liveins: $sgpr4_sgpr5
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
- ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1.sub4_sub5, 0, 0
- ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1.sub6_sub7, 0, 0
- ; GCN-NEXT: early-clobber %4:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %1.sub6_sub7, 0, 0
- ; GCN-NEXT: dead [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- ; GCN-NEXT: early-clobber %6:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %1.sub4_sub5, 0, 0
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_LOAD_DWORDX2_IMM1]]
- ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %4.sub7
- ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %4.sub6
- ; GCN-NEXT: undef [[V_PK_MUL_F16_:%[0-9]+]].sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %6.sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: dead early-clobber %12:areg_256_align2 = V_MFMA_F64_16X16X4F64_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
- ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
- ; GCN-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 %6.sub6, [[V_MOV_B32_e32_]], implicit $exec
- ; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], %6.sub6, implicit $exec
- ; GCN-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 [[COPY4]], [[V_MOV_B32_e32_]], implicit $exec
- ; GCN-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec
- ; GCN-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, killed [[V_AND_B32_e32_]], 0, killed [[V_AND_B32_e32_1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, killed [[V_LSHRREV_B32_e64_]], 0, killed [[V_LSHRREV_B32_e64_1]], 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_MUL_F16_e64_1]], implicit $exec
- ; GCN-NEXT: [[V_PK_MUL_F16_:%[0-9]+]].sub2:vreg_128_align2 = V_OR_B32_e64 [[V_MUL_F16_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
- ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %4.sub5
- ; GCN-NEXT: dead [[V_PK_MUL_F16_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %6.sub5, 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
- %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
- early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
- %22:sreg_64_xexec = S_LOAD_DWORDX2_IMM %8.sub4_sub5, 0, 0
- %23:sreg_64_xexec = S_LOAD_DWORDX2_IMM %8.sub6_sub7, 0, 0
- early-clobber %25:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %8.sub6_sub7, 0, 0
- %12:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
- early-clobber %24:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %8.sub4_sub5, 0, 0
- %29:vreg_64_align2 = COPY %22
- %30:vreg_64_align2 = COPY %23
- %51:vgpr_32 = COPY %25.sub7
- %55:vgpr_32 = COPY %25.sub6
- undef %99.sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub7, 8, %51, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- %28:areg_256_align2 = V_MFMA_F64_16X16X4F64_e64 %29, %30, 0, 0, 0, 0, implicit $mode, implicit $exec
- %99.sub2:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub6, 8, %55, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- %59:vgpr_32 = COPY %25.sub5
- %99.sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub5, 8, %59, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- S_ENDPGM 0
>From 8ed311eb3c7530b996e350b933a037fa409b677d Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 2 Sep 2025 19:30:43 -0500
Subject: [PATCH 15/16] code cleanup, add code comments
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 31 ++++++++++++-------
1 file changed, 20 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index e2d1fc073bd73..281208a143161 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -65,22 +65,39 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
+ // creates a list of packed instructions following an MFMA that are suitable
+ // for unpacking
bool createListOfPackedInstr(MachineInstr &BeginMI,
SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles);
+ // check if the machine instruction being processed is a supported packed
+ // instruction
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
+ // function to perform unpacking of F32 packed instructions with 2 source
+ // operands, such as V_PK_MUL and V_PK_ADD. Currently, only V_PK_MUL and
+ // V_PK_ADD are supported for this transformation
void processF32Unpacking(MachineInstr &I);
+ // select corresponding unpacked instruction from packed instruction as input
uint16_t mapToUnpackedOpcode(MachineInstr &I);
-
+ // inserts appropriate unpacked instructions into the BB
void insertUnpackedF32MI(MachineInstr &I, MachineOperand &DstMO,
MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
bool isVreg_64);
+ // function to perform unpacking of F32 packed instructions with 3 source
+ // operands, such as V_PK_FMA. Currently, only V_PK_FMA is supported for this
+ // transformation
void processFMAF32Unpacking(MachineInstr &I);
+ // creates the unpacked instruction to be inserted. Adds source modifiers to
+ // the unpacked instructions based on the source modifiers in the packed
+ // instruction
MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
const DebugLoc &DL,
uint16_t UnpackedOpcode, bool isHiBits,
bool isFMA);
+ // checks if there are register dependencies between those used by the MFMA
+ // instruction and the following packed instructions. Conservatively ensures
+ // that we do not incorrectly read/write registers.
bool hasReadWriteDependencies(const MachineInstr &PredMI,
const MachineInstr &SuccMI);
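
Taken together, the helpers documented above follow a simple shape: walk past each MFMA, gather the packed instructions that still fit in its latency shadow, then rewrite each one as two 32-bit operations. The standalone sketch below captures only that shape; the Instr struct, the fixed cycle charges, and the helper names are simplifications for illustration, not the real pass structure.

  #include <cstddef>
  #include <vector>

  struct Instr {
    bool IsMFMA = false;
    bool IsPackedF32 = false;
  };

  // Counterpart of isUnpackingSupportedInstr: which instructions may be unpacked.
  static bool suitableForUnpacking(const Instr &I) { return I.IsPackedF32; }

  // Counterpart of createListOfPackedInstr: collect candidates after an MFMA
  // while the accumulated cost still fits in the assumed cycle budget.
  static std::vector<Instr *> collectCandidates(std::vector<Instr> &Block,
                                                size_t MFMAIdx, int BudgetCycles) {
    std::vector<Instr *> Out;
    int UsedCycles = 0;
    for (size_t I = MFMAIdx + 1; I < Block.size(); ++I) {
      if (UsedCycles >= BudgetCycles)
        break;
      if (suitableForUnpacking(Block[I])) {
        UsedCycles += 2; // two unpacked replacements, each assumed to cost 1
        Out.push_back(&Block[I]);
      } else {
        UsedCycles += 1; // other instructions eat into the budget too
      }
    }
    return Out;
  }

  // Counterpart of processF32Unpacking: rewrite one packed op as two scalar ops.
  static void unpack(Instr &I) { I.IsPackedF32 = false; }

  int main() {
    std::vector<Instr> Block(8);
    Block[0].IsMFMA = true;
    Block[3].IsPackedF32 = Block[5].IsPackedF32 = true;
    for (Instr *Candidate : collectCandidates(Block, 0, /*BudgetCycles=*/16))
      unpack(*Candidate);
    return 0;
  }
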
@@ -312,8 +329,6 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles) {
auto *BB = BeginMI.getParent();
- auto *MF = BB->getParent();
- int NumInst = 0;
auto E = BB->end();
int TotalCyclesBetweenCandidates = 0;
auto SchedModel = TII->getSchedModel();
@@ -397,7 +412,7 @@ void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) {
MachineBasicBlock &MBB = *I.getParent();
Register DstReg = I.getOperand(0).getReg();
const DebugLoc &DL = I.getDebugLoc();
- const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg());
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
@@ -440,9 +455,6 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
Register DstReg = DstMO.getReg();
- Register SrcReg1 = SrcMO1.getReg();
- Register SrcReg2 = SrcMO2.getReg();
- const TargetRegisterClass *DstRC = MRI->getRegClass(DstMO.getReg());
unsigned DestSubIdx =
isHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1)
: TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
@@ -498,7 +510,6 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(
if (isFMA) {
MachineOperand &SrcMO3 = I.getOperand(6);
- Register SrcReg3 = SrcMO3.getReg();
int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(
I.getOpcode(), AMDGPU::OpName::src2_modifiers);
unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
@@ -538,14 +549,12 @@ void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
processFMAF32Unpacking(I);
return;
}
- MachineBasicBlock &MBB = *I.getParent();
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
- const DebugLoc &DL = I.getDebugLoc();
- const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg());
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstMO.getReg());
bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64);
>From 064fa84c1853e9071d660d7ed73bda597ded13a4 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 4 Sep 2025 12:54:53 -0500
Subject: [PATCH 16/16] add support for immediate operands, modularize code
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 227 ++++++------------
1 file changed, 77 insertions(+), 150 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 281208a143161..ed52a56355486 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -39,7 +39,6 @@
#include "GCNPreRAOptimizations.h"
#include "AMDGPU.h"
-#include "GCNSchedStrategy.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
@@ -48,8 +47,6 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
@@ -65,42 +62,39 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
- // creates a list of packed instructions following an MFMA that are suitable
- // for unpacking
+ // Creates a list of packed instructions following an MFMA that are suitable
+ // for unpacking.
bool createListOfPackedInstr(MachineInstr &BeginMI,
SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles);
- // check if the machine instruction being processed is a supported packed
+ // Check if the machine instruction being processed is a supported packed
// instruction
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
- // function to perform unpacking of F32 packed instructions with 2 source
- // operands, such as V_PK_MUL and V_PK_ADD. Currently, only V_PK_MUL and
- // V_PK_ADD are supported for this transformation
+ // Unpack F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and V_PK_FMA.
+ // Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for this
+ // transformation.
void processF32Unpacking(MachineInstr &I);
- // select corresponding unpacked instruction from packed instruction as input
+ // Select corresponding unpacked instruction from packed instruction as input
uint16_t mapToUnpackedOpcode(MachineInstr &I);
- // inserts appropriate unpacked instructions into the BB
- void insertUnpackedF32MI(MachineInstr &I, MachineOperand &DstMO,
- MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
- MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
- bool isVreg_64);
- // function to perform unpacking of F32 packed instructions with 3 source
- // operands, such as V_PK_FMA. Currently, only V_PK_FMA is supported for this
- // transformation
- void processFMAF32Unpacking(MachineInstr &I);
- // creates the unpacked instruction to be inserted. Adds source modifiers to
+ // Insert appropriate unpacked instructions into the BB
+ void insertUnpackedF32MI(MachineInstr &I, bool IsVreg_64, bool IsFMA);
+ // Creates the unpacked instruction to be inserted. Adds source modifiers to
// the unpacked instructions based on the source modifiers in the packed
// instruction
MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
const DebugLoc &DL,
- uint16_t UnpackedOpcode, bool isHiBits,
- bool isFMA);
- // checks if there are register dependencies between those used by the MFMA
+ uint16_t UnpackedOpcode, bool IsHiBits,
+ bool IsFMA);
+ // Identify register dependencies between those used by the MFMA
// instruction and the following packed instructions. Conservatively ensures
// that we do not incorrectly read/write registers.
bool hasReadWriteDependencies(const MachineInstr &PredMI,
const MachineInstr &SuccMI);
+ void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods,
+ unsigned NegModifier, unsigned OpSelModifier,
+ MachineOperand &SrcMO);
+
public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
bool run(MachineFunction &MF);
@@ -278,18 +272,17 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
case AMDGPU::V_PK_MUL_F32:
- return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg());
case AMDGPU::V_PK_FMA_F32:
- return (MI.getOperand(2).isReg() && MI.getOperand(4).isReg() &&
- MI.getOperand(6).isReg());
+ return true;
default:
return false;
}
+ llvm_unreachable("Fully covered switch");
}
uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
unsigned Opcode = I.getOpcode();
- // use 64 bit encoding to allow use of VOP3 instructions.
+ // Use 64 bit encoding to allow use of VOP3 instructions.
// VOP3 instructions allow VOP3P source modifiers to be translated to VOP3
// e32 instructions are VOP2 and don't allow source modifiers
switch (Opcode) {
@@ -302,6 +295,7 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
default:
return std::numeric_limits<uint16_t>::max();
}
+ llvm_unreachable("Fully covered switch");
}
bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies(
@@ -349,7 +343,7 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
if (hasReadWriteDependencies(BeginMI, Instr))
return false;
-      // if it is a packed instruction, we should subtract its latency from the
+      // If it is a packed instruction, we should subtract its latency from the
// overall latency calculation here, because the packed instruction will
// be removed and replaced by 2 unpacked instructions
TotalCyclesBetweenCandidates -=
@@ -359,37 +353,32 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
// instructions have latency of 1.
// TODO: improve latency handling of possible inserted instructions
TotalCyclesBetweenCandidates += 2;
- if (!(TotalCyclesBetweenCandidates >= NumMFMACycles))
+ if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1))
InstrsToUnpack.insert(&Instr);
}
}
return true;
}
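
As a concrete worked example of the adjusted bound (with assumed numbers, since the real values come from the scheduling model): if NumMFMACycles is 16 and each packed candidate has a latency of 1, every candidate changes the running total by -1 + 2 = +1, so the check TotalCyclesBetweenCandidates < NumMFMACycles - 1 admits the first 14 consecutive candidates and keeps later ones packed; any other instructions in between shrink that headroom further.
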
-void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
- MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
- MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1,
- MachineOperand &HiSrcMO2, bool IsVreg_64) {
-
+void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(MachineInstr &I,
+ bool IsVreg_64,
+ bool IsFMA) {
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- Register DstReg = DstMO.getReg();
+ Register DstReg = I.getOperand(0).getReg();
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
return;
- MachineInstrBuilder Op0L_Op1L =
- createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, false);
- if (IsVreg_64) {
+ MachineInstrBuilder Op0L_Op1L = createUnpackedMI(
+ MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA);
+ if (IsVreg_64 || I.getOperand(0).isUndef())
Op0L_Op1L->getOperand(0).setIsUndef();
- } else if (DstMO.isUndef()) {
- Op0L_Op1L->getOperand(0).setIsUndef();
- }
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
- MachineInstrBuilder Op0H_Op1H =
- createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, false);
+ MachineInstrBuilder Op0H_Op1H = createUnpackedMI(
+ MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA);
LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
@@ -408,55 +397,52 @@ void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
return;
}
-void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) {
- MachineBasicBlock &MBB = *I.getParent();
- Register DstReg = I.getOperand(0).getReg();
- const DebugLoc &DL = I.getDebugLoc();
- const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
- bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
-
- uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
- if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
- return;
-
- MachineInstrBuilder Op0L_Op1L =
- createUnpackedMI(MBB, I, DL, UnpackedOpcode, false, true);
- if (IsVReg64)
- Op0L_Op1L->getOperand(0).setIsUndef();
- else if (I.getOperand(0).isUndef()) {
- Op0L_Op1L->getOperand(0).setIsUndef();
- }
- LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
-
- MachineInstrBuilder Op0H_Op1H =
- createUnpackedMI(MBB, I, DL, UnpackedOpcode, true, true);
- LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
-
- if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
- Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
- Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+void GCNPreRAOptimizationsImpl::addOperandandMods(MachineInstrBuilder NewMI,
+ unsigned Src_Mods,
+ unsigned NegModifier,
+ unsigned OpSelModifier,
+ MachineOperand &SrcMO) {
+ unsigned New_Src_Mods = 0;
+ // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
+ // lane.
+ // NEG_HI shares the same bit position with ABS. But packed instructions do
+ // not support ABS. Therefore, NEG_HI must be translated to NEG source
+ // modifier for the higher 32 bits. Unpacked VOP3 instructions do support
+ // ABS, therefore we need to explicitly add the NEG modifier if present in
+ // the packed instruction
+ if (Src_Mods & NegModifier) {
+ New_Src_Mods |= SISrcMods::NEG;
}
- if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
- Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract);
- Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract);
+ // Src modifiers. Only negative modifiers are added if needed. Unpacked
+ // operations do not have op_sel, therefore it must be handled explicitly as
+ // done below. Unpacked operations support abs, but packed instructions do
+ // not. Thus, abs is not handled.
+ NewMI.addImm(New_Src_Mods);
+ if (SrcMO.isImm()) {
+ NewMI.addImm(SrcMO.getImm());
+ } else {
+ if (Src_Mods & OpSelModifier) {
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub1);
+ NewMI.addReg(SrcMO.getReg(), 0, Src0SubIdx); // src0
+ } else {
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub0);
+ // If op_sel == 0, select register 0 of reg:sub0_sub1
+ NewMI.addReg(SrcMO.getReg(), 0, Src0SubIdx);
+ }
}
-
- LIS->RemoveMachineInstrFromMaps(I);
- I.eraseFromParent();
- LIS->removeInterval(DstReg);
- LIS->createAndComputeVirtRegInterval(DstReg);
- return;
}
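
With immediates now allowed, the helper effectively dispatches on the operand kind: immediates are forwarded untouched, while register operands get the 32-bit half selected by the op_sel bit. Below is a toy standalone version of that dispatch; the Operand variant and the printed "builder" calls are illustrative, not the real MachineOperand or MachineInstrBuilder API.

  #include <cstdio>
  #include <variant>

  struct RegOperand {
    unsigned VReg; // virtual register number, illustrative only
  };
  using Operand = std::variant<long long, RegOperand>; // immediate or register

  // Append one unpacked source operand: immediates pass through unchanged,
  // registers read sub1 when the packed op_sel bit is set and sub0 otherwise.
  static void addSource(const Operand &Src, bool OpSelSet) {
    if (const long long *Imm = std::get_if<long long>(&Src)) {
      std::printf("addImm(%lld)\n", *Imm);
      return;
    }
    unsigned SubIdx = OpSelSet ? 1 : 0;
    std::printf("addReg(v%u, sub%u)\n", std::get<RegOperand>(Src).VReg, SubIdx);
  }

  int main() {
    addSource(Operand{RegOperand{12}}, /*OpSelSet=*/true);  // register, high half
    addSource(Operand{1065353216LL},   /*OpSelSet=*/false); // raw immediate bits of 1.0f
    return 0;
  }
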
MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(
MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL,
- uint16_t UnpackedOpcode, bool isHiBits, bool isFMA) {
+ uint16_t UnpackedOpcode, bool IsHiBits, bool IsFMA) {
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
Register DstReg = DstMO.getReg();
unsigned DestSubIdx =
- isHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1)
+ IsHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1)
: TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
int ClampIdx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
@@ -472,92 +458,33 @@ MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(
unsigned New_Src0_Mods = 0;
unsigned New_Src1_Mods = 0;
- unsigned NegModifier = isHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
- unsigned OpSelModifier = isHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
+ unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
+ unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(DstReg, 0, DestSubIdx); // vdst
- if (Src0_Mods & NegModifier) {
- New_Src0_Mods |= SISrcMods::NEG;
- }
- NewMI.addImm(New_Src0_Mods); // src0_modifiers
-
- if (Src0_Mods & OpSelModifier) {
- unsigned Src0SubIdx =
- TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub1);
- NewMI.addReg(SrcMO1.getReg(), 0, Src0SubIdx); // src0
- } else {
- unsigned Src0SubIdx =
- TRI->composeSubRegIndices(SrcMO1.getSubReg(), AMDGPU::sub0);
- // if op_sel == 0, select register 0 of reg:sub0_sub1
- NewMI.addReg(SrcMO1.getReg(), 0, Src0SubIdx);
- }
-
- if (Src1_Mods & NegModifier) {
- New_Src1_Mods |= SISrcMods::NEG;
- }
- NewMI.addImm(New_Src1_Mods); // src1_modifiers
- if (Src1_Mods & OpSelModifier) {
- unsigned Src1SubIdx =
- TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub1);
- NewMI.addReg(SrcMO2.getReg(), 0, Src1SubIdx); // src0
- } else {
- // if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- unsigned Src1SubIdx =
- TRI->composeSubRegIndices(SrcMO2.getSubReg(), AMDGPU::sub0);
- NewMI.addReg(SrcMO2.getReg(), 0, Src1SubIdx);
- }
+ addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1);
+ addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2);
- if (isFMA) {
+ if (IsFMA) {
MachineOperand &SrcMO3 = I.getOperand(6);
int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(
I.getOpcode(), AMDGPU::OpName::src2_modifiers);
unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
- unsigned New_Src2_Mods = 0;
- // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
- // lane.
- // This is also true for NEG_HI as it shares the same bit position with
- // ABS. But packed instructions do not support ABS. Therefore, NEG_HI must
- // be translated to NEG source modifier for the higher 32 bits.
- // Unpacked VOP3 instructions do support ABS, therefore we need to
- // explicitly add the NEG modifier if present in the packed instruction
- if (Src2_Mods & NegModifier) {
- // New_Src2_Mods |= NegModifier;
- New_Src2_Mods |= SISrcMods::NEG;
- }
- NewMI.addImm(New_Src2_Mods); // src2_modifiers
- if (Src2_Mods & OpSelModifier) {
- unsigned Src2SubIdx =
- TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub1);
- NewMI.addReg(SrcMO3.getReg(), 0, Src2SubIdx);
- } else {
- unsigned Src2SubIdx =
- TRI->composeSubRegIndices(SrcMO3.getSubReg(), AMDGPU::sub0);
- // if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- NewMI.addReg(SrcMO3.getReg(), 0, Src2SubIdx);
- }
+ addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3);
}
NewMI.addImm(ClampVal); // clamp
- // packed instructions do not support output modifiers. safe to assign them 0
+  // Packed instructions do not support output modifiers. Safe to assign them 0
// for this use case
NewMI.addImm(0); // omod
return NewMI;
}
void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
- if (I.getOpcode() == AMDGPU::V_PK_FMA_F32) {
- processFMAF32Unpacking(I);
- return;
- }
-
- MachineOperand &DstMO = I.getOperand(0);
- MachineOperand &SrcMO1 = I.getOperand(2);
- MachineOperand &SrcMO2 = I.getOperand(4);
-
- const TargetRegisterClass *DstRC = MRI->getRegClass(DstMO.getReg());
-
+  bool IsFMA = I.getOpcode() == AMDGPU::V_PK_FMA_F32;
+ const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg());
bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- insertUnpackedF32MI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, IsVReg64);
+ insertUnpackedF32MI(I, IsVReg64, IsFMA);
return;
}
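
For instance (with illustrative operands), after this patch a V_PK_MUL_F32 writing a 64-bit vreg becomes two V_MUL_F32_e64 instructions: one defines sub0 from the sub0 halves of the sources and one defines sub1 from the sub1 halves, with op_sel/op_sel_hi swapping which half is read, NEG/NEG_HI carried over as per-half NEG modifiers, the clamp bit copied, and omod forced to 0.
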