[llvm] Co-issue packed instructions by unpacking (PR #151704)
Akash Dutta via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 17 07:32:28 PDT 2025
https://github.com/akadutta updated https://github.com/llvm/llvm-project/pull/151704
>From 7c443285ec2df426ca1ff93f236fcb67d735338f Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 29 Jul 2025 15:40:34 -0500
Subject: [PATCH 1/9] initial commit
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 409 ++++++++++++++++++
1 file changed, 409 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..d76502d18f7e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -39,6 +39,21 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
+#include "AMDGPURegisterBankInfo.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/InitializePasses.h"
+#include <unordered_set>
+
+#include "GCNSchedStrategy.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
@@ -53,6 +68,17 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
+ bool unpackInsts(MachineFunction &MF);
+ bool createListOfPackedInstr(MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen);
+ bool isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const;
+ bool isUnpackingSupportedInstr(MachineInstr &MI) const;
+ void insertMI(MachineInstr &I);
+ SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
+ unsigned SGPRSrcPos);
+ SmallVector<MachineInstr *, 2>
+ insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
+ MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
+ bool isVreg_64);
public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -62,6 +88,7 @@ class GCNPreRAOptimizationsImpl {
class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
public:
static char ID;
+ const MachineLoopInfo *MLI = nullptr;
GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {
initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry());
@@ -75,6 +102,7 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervalsWrapperPass>();
+ AU.addRequired<MachineLoopInfoWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -225,10 +253,390 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
+bool GCNPreRAOptimizationsImpl::isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const {
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ // bool IsGFX942Only = ST.hasGFX940Insts() && !ST.hasGFX950Insts();
+ // if (!IsGFX942Only)
+ // return false;
+
+ if (!SIInstrInfo::isVALU(MI)){
+ return false;
+ }
+
+
+ // V_COS, V_EXP, V_RCP, etc.
+ if (SIInstrInfo::isTRANS(MI))
+ return true;
+
+ // DOT2, DOT2C, DOT4, etc.
+ if (SIInstrInfo::isDOT(MI))
+ return true;
+
+ // MFMA, SMFMA
+ if (SIInstrInfo::isMFMA(MI))
+ return true;
+
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_CVT_PK_BF8_F32_e64:
+ case AMDGPU::V_CVT_PK_FP8_F32_e64:
+ case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+ case AMDGPU::V_MQSAD_U32_U8_e64:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_I16:
+ case AMDGPU::V_PK_ADD_U16:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_FMA_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMAC_F16_e32:
+ case AMDGPU::V_PK_FMAC_F16_e64:
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_MAD_I16:
+ case AMDGPU::V_PK_MAD_U16:
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_PK_MAX_I16:
+ case AMDGPU::V_PK_MAX_U16:
+ case AMDGPU::V_PK_MIN_F16:
+ case AMDGPU::V_PK_MIN_I16:
+ case AMDGPU::V_PK_MIN_U16:
+ case AMDGPU::V_PK_MOV_B32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_LO_U16:
+ case AMDGPU::V_PK_SUB_I16:
+ case AMDGPU::V_PK_SUB_U16:
+ case AMDGPU::V_QSAD_PK_U16_U8_e64:
+ return true;
+
+ default:
+ return false;
+
+ }
+}
+
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ return true;
+
+ default:
+ return false;
+
+ }
+}
+
+SmallVector<MachineInstr *, 2>
+GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
+ unsigned SGPRSrcPos) {
+ SmallVector<MachineInstr *, 2> MIList;
+
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VReg_64_Align2RegClass);
+ MachineInstr *CopySGPR1 =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY))
+ .addDef(TmpReg, RegState::Undef)
+ .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub0);
+ unsigned SubIdx = TRI->composeSubRegIndices(
+ AMDGPU::sub0, CopySGPR1->getOperand(0).getSubReg());
+ CopySGPR1->getOperand(0).setReg(CopySGPR1->getOperand(0).getReg());
+ CopySGPR1->getOperand(0).setSubReg(SubIdx);
+ LIS->InsertMachineInstrInMaps(*CopySGPR1);
+ MIList.push_back(CopySGPR1);
+
+ MachineInstr *CopySGPR2 =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY))
+ .addDef(TmpReg)
+ .addReg(I.getOperand(SGPRSrcPos).getReg(), 0, AMDGPU::sub1);
+ SubIdx = TRI->composeSubRegIndices(AMDGPU::sub1,
+ CopySGPR2->getOperand(0).getSubReg());
+ CopySGPR2->getOperand(0).setReg(CopySGPR2->getOperand(0).getReg());
+ CopySGPR2->getOperand(0).setSubReg(SubIdx);
+ LIS->InsertMachineInstrInMaps(*CopySGPR2);
+ MIList.push_back(CopySGPR2);
+ return MIList;
+}
+
+bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
+ MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen) {
+ auto *BB = BeginMI.getParent();
+ auto *MF = BB->getParent();
+ int NumInst = 0;
+
+ auto E = BB->end();
+ auto schedModel = TII->getSchedModel();
+ const MCSchedClassDesc *schedClassDesc = schedModel.resolveSchedClass(&BeginMI);
+ const int NumMFMACycles = schedModel.getWriteProcResBegin(schedClassDesc)->ReleaseAtCycle;
+ int totalCyclesBetweenCandidates = 0;
+ for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+ MachineInstr &Instr = *I;
+ const MCSchedClassDesc *instrSchedClassDesc = schedModel.resolveSchedClass(&Instr);
+ totalCyclesBetweenCandidates += schedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
+ if (Instr.isMetaInstruction())
+ continue;
+
+ if (Instr.isTerminator())
+ return false;
+
+ if (totalCyclesBetweenCandidates > NumMFMACycles)
+ return false;
+
+ if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F32) && isNeverCoissue(Instr, Instr.getParent()->getParent())) {
+ totalCyclesBetweenCandidates += 1;
+ seen.insert(&Instr);
+ }
+ }
+ return true;
+}
+
+SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
+ MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
+ MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64) {
+
+ SmallVector<MachineInstr *, 2> MIList;
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register DstReg = DstMO.getReg();
+
+ unsigned SrcSubIdx1 =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
+ unsigned SrcSubIdx2 =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
+ unsigned DestSubIdx =
+ TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
+
+ const MCInstrDesc instrDesc = I.getDesc();
+
+ int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int64_t clampVal = I.getOperand(clampIdx).getImm();
+
+ int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
+ unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
+
+ //don't worry about abs values. Packed instructions (VOP3P) do not support them
+ unsigned Lo_src0_mods = 0;
+ unsigned Lo_src1_mods = 0;
+
+ MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst
+ if (src0_Mods & SISrcMods::OP_SEL_0) {
+ if (src0_Mods & SISrcMods::NEG) {
+ Lo_src0_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0
+ }
+ else {
+ Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel == 0, select register 0 of reg:sub0_sub1
+ }
+
+ if (src1_Mods & SISrcMods::OP_SEL_0) {
+ if (src1_Mods & SISrcMods::NEG) {
+ Lo_src1_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src1
+ }
+ else {
+ Op0L_Op1L.addImm(Lo_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src1 //if op_sel == 0, select register 0 of reg:sub0_sub1
+ }
+ Op0L_Op1L.addImm(clampVal); //clamp
+ //packed instructions do not support output modifiers. safe to assign them 0 for this use case
+ Op0L_Op1L.addImm(0); //omod
+
+ if (isVreg_64) {
+ Op0L_Op1L->getOperand(0).setIsUndef();
+ }
+ else {
+ if (I.getOperand(0).isUndef()) {
+ Op0L_Op1L->getOperand(0).setIsUndef();
+ }
+ }
+
+ LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
+
+ SrcSubIdx1 =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ SrcSubIdx2 =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ DestSubIdx =
+ TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
+
+ //don't worry about abs values. Packed instructions (VOP3P) do not support them
+ unsigned Hi_src0_mods = 0;
+ unsigned Hi_src1_mods = 0;
+
+ MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst
+ if (src0_Mods & SISrcMods::OP_SEL_1) {
+ if (src0_Mods & SISrcMods::NEG_HI) {
+ Hi_src0_mods |= SISrcMods::NEG;
+ }
+ Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0
+ }
+ else {
+ Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers
+ unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ }
+
+ if (src1_Mods & SISrcMods::OP_SEL_1) {
+ if (src1_Mods & SISrcMods::NEG_HI) {
+ Hi_src1_mods |= SISrcMods::NEG;
+ }
+ Op0H_Op1H.addImm(Hi_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src1
+ }
+ else {
+ Op0H_Op1H.addImm(Hi_src1_mods); //src1_modifiers
+ unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0);
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src1 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ }
+ Op0H_Op1H.addImm(clampVal); //clamp
+ //packed instructions do not support output modifiers. safe to assign them 0 for this use case
+ Op0H_Op1H.addImm(0); //omod
+ LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
+
+ if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ }
+ LIS->RemoveMachineInstrFromMaps(I);
+ I.eraseFromParent();
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+ MIList.push_back(Op0L_Op1L);
+ MIList.push_back(Op0H_Op1H);
+ return MIList;
+}
+
+void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg1 = I.getOperand(2).getReg();
+ Register SrcReg2 = I.getOperand(4).getReg();
+
+ MachineOperand &DstMO = I.getOperand(0);
+ MachineOperand &SrcMO1 = I.getOperand(2);
+ MachineOperand &SrcMO2 = I.getOperand(4);
+
+ MachineBasicBlock::iterator MII = I;
+ const DebugLoc &DL = I.getDebugLoc();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg());
+ const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg());
+ const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg());
+ const TargetRegisterClass *Src0SubRC =
+ TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1);
+
+ if ((Src1RC->getID() == AMDGPU::SGPR_64RegClassID) ||
+ (Src0RC->getID() == AMDGPU::SGPR_64RegClassID)) {
+ if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ // try with sgpr32
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 4);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
+
+ if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), true);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ } else {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), false);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ }
+ }
+ else {
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
+
+ if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, true);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ } else {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, false);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ }
+ }
+ return;
+ }
+
+ if (DstRC->getID() == AMDGPU::VReg_512_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, SrcMO2, SrcMO1,
+ SrcMO2, false);
+ }
+ else if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, SrcMO2, SrcMO1,
+ SrcMO2, true);
+ }
+ return;
+}
+
+bool GCNPreRAOptimizationsImpl::unpackInsts(MachineFunction &MF) {
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
+ auto schedModel = TII->getSchedModel();
+ for (MachineBasicBlock &MBB : MF) {
+ std::unordered_set<MachineInstr *> seen;
+ for (MachineInstr &MI : MBB) {
+ if (SIInstrInfo::isMFMA(MI)){
+ createListOfPackedInstr(MI, seen);
+ }
+
+ }
+ if (!seen.empty()) {
+ for (MachineInstr *MI : seen)
+ insertMI(*MI);
+ }
+ }
+ return true;
+}
+
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+ MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
return GCNPreRAOptimizationsImpl(LIS).run(MF);
}
@@ -248,6 +656,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
bool Changed = false;
+ Changed = unpackInsts(MF);
for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!LIS->hasInterval(Reg))
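The op_sel handling in insertUnpackedMI above decides which 32-bit half (sub0 or sub1) of each 64-bit packed source feeds the lo and hi unpacked instructions. Below is a minimal standalone C++ model of that selection, not part of the patch; the OP_SEL bit values are assumed to mirror SISrcMods in SIDefines.h.

#include <cstdio>

// Illustrative stand-ins for the SISrcMods bits consulted by insertUnpackedMI
// (assumed to match llvm/lib/Target/AMDGPU/SIDefines.h).
enum SrcModBits : unsigned {
  OP_SEL_0 = 1u << 2, // selects the source half for the lo unpacked instruction
  OP_SEL_1 = 1u << 3, // selects the source half for the hi unpacked instruction
};

// The lo unpacked instruction reads sub1 if OP_SEL_0 is set, sub0 otherwise;
// the hi unpacked instruction does the same based on OP_SEL_1.
static const char *pickSubReg(unsigned Mods, bool HiHalf) {
  unsigned Bit = HiHalf ? OP_SEL_1 : OP_SEL_0;
  return (Mods & Bit) ? "sub1" : "sub0";
}

int main() {
  unsigned Src0Mods = OP_SEL_0; // example: src0 has op_sel set
  unsigned Src1Mods = OP_SEL_1; // example: src1 has op_sel_hi set
  std::printf("lo op reads src0:%s src1:%s\n",
              pickSubReg(Src0Mods, false), pickSubReg(Src1Mods, false));
  std::printf("hi op reads src0:%s src1:%s\n",
              pickSubReg(Src0Mods, true), pickSubReg(Src1Mods, true));
}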
>From 4bff9657e7016452f6657f1c217e804fd354d3ae Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 29 Jul 2025 15:44:14 -0500
Subject: [PATCH 2/9] add test
---
...unpack-non-coissue-insts-post-scheduler.ll | 116 ++++++++++++++++++
1 file changed, 116 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
new file mode 100644
index 0000000000000..5c6d376c92e65
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
@@ -0,0 +1,116 @@
+; TODO: change variable names. Make test smaller if possible
+
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+ at global_smem = external addrspace(3) global [0 x i8], align 16
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.amdgcn.exp2.f32(float)
+
+; Function Attrs: nofree norecurse nounwind
+define amdgpu_kernel void @attn_fwd(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg readonly captures(none) %2, ptr addrspace(1) inreg writeonly captures(none) %3, ptr addrspace(1) inreg writeonly captures(none) %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %12, i32 inreg %13, i32 inreg %14, i32 inreg %15, i32 inreg %16, i32 inreg %17, i32 inreg %18, i32 inreg %19, i32 inreg %20, i32 inreg %21, i32 inreg %22, float inreg %23, i32 inreg %24, ptr addrspace(1) inreg readnone captures(none) %25, i32 inreg %26, ptr addrspace(1) inreg readnone captures(none) %27) local_unnamed_addr {
+ %29 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+
+ %96 = sext i32 %8 to i64
+ %97 = getelementptr half, ptr addrspace(1) %1, i64 %96
+
+ %115 = icmp slt i32 %29, 16384
+
+ %135 = icmp slt i32 %29, 1
+
+ %215 = getelementptr half, ptr addrspace(3) @global_smem, i32 %29
+ %216 = load <8 x half>, ptr addrspace(3) %215, align 16
+
+ %276 = shl nuw nsw i32 %29, 7
+
+ %396 = getelementptr half, ptr addrspace(1) %97, i64 1
+ %397 = sext i32 %13 to i64
+ %398 = getelementptr half, ptr addrspace(1) %97, i64 %397
+
+ %536 = fsub float 0xFFF0000000000000, 0.5
+ %537 = tail call float @llvm.amdgcn.exp2.f32(float %536)
+
+ %538 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %29
+ %539 = load <8 x half>, ptr addrspace(3) %538, align 16
+
+ %573 = icmp ult i32 1, 511
+ br i1 %573, label %575, label %574
+
+574: ; preds = %28
+ br label %575
+
+575: ; preds = %574, %28
+ %610 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+
+ br label %686
+
+686: ; preds = %575, %686
+ %.pn347561 = phi float [ %537, %575 ], [ %1329, %686 ]
+
+
+ %690 = phi i32 [ 0, %575 ], [ %1120, %686 ]
+ %691 = phi ptr addrspace(1) [ %398, %575 ], [ %1117, %686 ]
+ %692 = phi ptr addrspace(1) [ %396, %575 ], [ %1116, %686 ]
+
+ %695 = phi <2 x half> [ %610, %575 ], [ %1414, %686 ]
+
+
+ %759 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ]
+ %760 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ]
+
+ %tmp6 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ]
+ %tmp7 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ]
+
+ %871 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+ tail call void @llvm.amdgcn.s.setprio(i16 0)
+ %872 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> %871, i32 0, i32 0, i32 0)
+ %879 = extractelement <16 x float> %872, i64 0
+
+
+ %957 = insertelement <2 x float> poison, float %.pn347561, i64 0
+ %958 = shufflevector <2 x float> %957, <2 x float> poison, <2 x i32> zeroinitializer
+ %959 = fmul <2 x float> %759, %958
+ %960 = fmul <2 x float> %760, %958
+
+ %tmp1 = fmul <2 x float> %tmp6, %958
+ %tmp2 = fmul <2 x float> %tmp7, %958
+
+ %1048 = shufflevector <2 x half> %695, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+
+ %1116 = getelementptr half, ptr addrspace(1) %692, i64 1
+ %1117 = getelementptr half, ptr addrspace(1) %691, i64 %397
+
+ %1119 = icmp slt i32 %690, 2
+ %1120 = select i1 %1119, i32 %690, i32 0
+ %.idx359 = shl i32 %1120, 14
+ %1121 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx359
+
+ %1140 = shufflevector <8 x half> %1048, <8 x half> %1048, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+
+ %1157 = shufflevector <2 x float> %959, <2 x float> %960, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+ %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+
+ %1173 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %1157, i32 0, i32 0, i32 0)
+ %tmp4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %tmp3, i32 0, i32 0, i32 0)
+
+
+ %1329 = tail call float @llvm.amdgcn.exp2.f32(float %879)
+
+ %.idx367 = shl i32 %690, 14
+ %1404 = getelementptr i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx367
+
+ %1412 = add nuw nsw i32 0, 64
+ %1413 = icmp samesign ult i32 0, 7936
+ %1414 = shufflevector <8 x half> %1140, <8 x half> poison, <2 x i32> <i32 0, i32 1>
+
+ %1478 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> <i32 0, i32 1>
+ %tmp5 = shufflevector <16 x float> %tmp4, <16 x float> poison, <2 x i32> <i32 0, i32 1>
+
+ br i1 %1413, label %686, label %1510
+
+1510: ; preds = %686
+ ret void
+}
>From d3b19c668d30e4dc906a301c13d2cf6a2e434c7a Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 29 Jul 2025 16:31:30 -0500
Subject: [PATCH 3/9] code cleanup
---
llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index d76502d18f7e7..e2c65bf25d31c 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -39,19 +39,12 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
-#include "AMDGPURegisterBankInfo.h"
#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
#include <unordered_set>
#include "GCNSchedStrategy.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;
@@ -88,7 +81,6 @@ class GCNPreRAOptimizationsImpl {
class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
public:
static char ID;
- const MachineLoopInfo *MLI = nullptr;
GCNPreRAOptimizationsLegacy() : MachineFunctionPass(ID) {
initializeGCNPreRAOptimizationsLegacyPass(*PassRegistry::getPassRegistry());
@@ -102,7 +94,6 @@ class GCNPreRAOptimizationsLegacy : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervalsWrapperPass>();
- AU.addRequired<MachineLoopInfoWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -636,7 +627,6 @@ bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
- MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
return GCNPreRAOptimizationsImpl(LIS).run(MF);
}
>From c581612e5cd376b5ee6ef19626444dec25e077d6 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 31 Jul 2025 20:02:28 -0500
Subject: [PATCH 4/9] miscellaneous code optimizations and cleanup
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 201 ++++++------------
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 59 ++++-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 1 +
3 files changed, 127 insertions(+), 134 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index e2c65bf25d31c..844fc1439099f 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -28,6 +28,12 @@
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
/// the VGPR_32, the COPY can be completely eliminated.
///
+/// Additionally, this pass unpacks packed instructions (V_PK_MUL_F32 and
+/// V_PK_ADD_F32) adjacent to MFMAs so that they can be co-issued. This helps
+/// overlap MFMAs with certain vector instructions in machine schedules and is
+/// expected to improve performance. Only packed instructions whose unpacked
+/// forms fit within the latency of the preceding MFMA are unpacked; the rest
+/// are left untouched.
//===----------------------------------------------------------------------===//
#include "GCNPreRAOptimizations.h"
@@ -38,12 +44,10 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
-
+#include "llvm/ADT/DenseSet.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
-#include <unordered_set>
-
#include "GCNSchedStrategy.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
@@ -61,11 +65,10 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
- bool unpackInsts(MachineFunction &MF);
- bool createListOfPackedInstr(MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen);
- bool isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const;
+ bool createListOfPackedInstr(MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack);
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
void insertMI(MachineInstr &I);
+ uint16_t mapToUnpackedOpcode(MachineInstr &I);
SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
unsigned SGPRSrcPos);
SmallVector<MachineInstr *, 2>
@@ -244,80 +247,28 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
-bool GCNPreRAOptimizationsImpl::isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const {
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- // bool IsGFX942Only = ST.hasGFX940Insts() && !ST.hasGFX950Insts();
- // if (!IsGFX942Only)
- // return false;
-
- if (!SIInstrInfo::isVALU(MI)){
- return false;
- }
-
-
- // V_COS, V_EXP, V_RCP, etc.
- if (SIInstrInfo::isTRANS(MI))
- return true;
-
- // DOT2, DOT2C, DOT4, etc.
- if (SIInstrInfo::isDOT(MI))
- return true;
-
- // MFMA, SMFMA
- if (SIInstrInfo::isMFMA(MI))
- return true;
-
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- case AMDGPU::V_CVT_PK_BF8_F32_e64:
- case AMDGPU::V_CVT_PK_FP8_F32_e64:
- case AMDGPU::V_MQSAD_PK_U16_U8_e64:
- case AMDGPU::V_MQSAD_U32_U8_e64:
- case AMDGPU::V_PK_ADD_F16:
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_ADD_I16:
- case AMDGPU::V_PK_ADD_U16:
- case AMDGPU::V_PK_ASHRREV_I16:
- case AMDGPU::V_PK_FMA_F16:
- case AMDGPU::V_PK_FMA_F32:
- case AMDGPU::V_PK_FMAC_F16_e32:
- case AMDGPU::V_PK_FMAC_F16_e64:
- case AMDGPU::V_PK_LSHLREV_B16:
- case AMDGPU::V_PK_LSHRREV_B16:
- case AMDGPU::V_PK_MAD_I16:
- case AMDGPU::V_PK_MAD_U16:
- case AMDGPU::V_PK_MAX_F16:
- case AMDGPU::V_PK_MAX_I16:
- case AMDGPU::V_PK_MAX_U16:
- case AMDGPU::V_PK_MIN_F16:
- case AMDGPU::V_PK_MIN_I16:
- case AMDGPU::V_PK_MIN_U16:
- case AMDGPU::V_PK_MOV_B32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_MUL_LO_U16:
- case AMDGPU::V_PK_SUB_I16:
- case AMDGPU::V_PK_SUB_U16:
- case AMDGPU::V_QSAD_PK_U16_U8_e64:
- return true;
-
- default:
- return false;
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F32:
+ return true;
+
+ default:
+ return false;
}
}
-bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
- unsigned Opcode = MI.getOpcode();
+uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
+ unsigned Opcode = I.getOpcode();
switch (Opcode) {
- case AMDGPU::V_PK_ADD_F16:
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_MUL_F32:
- return true;
-
- default:
- return false;
+ case AMDGPU::V_PK_ADD_F32:
+ return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::V_PK_MUL_F32:
+ return AMDGPU::V_MUL_F32_e64;
+ default:
+ return std::numeric_limits<uint16_t>::max();
}
}
@@ -358,7 +309,7 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
}
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
- MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen) {
+ MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack) {
auto *BB = BeginMI.getParent();
auto *MF = BB->getParent();
int NumInst = 0;
@@ -377,13 +328,13 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
if (Instr.isTerminator())
return false;
-
+
if (totalCyclesBetweenCandidates > NumMFMACycles)
return false;
- if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F32) && isNeverCoissue(Instr, Instr.getParent()->getParent())) {
+ if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
totalCyclesBetweenCandidates += 1;
- seen.insert(&Instr);
+ instrsToUnpack.insert(&Instr);
}
}
return true;
@@ -420,8 +371,8 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
//don't worry about abs values. Packed instructions (VOP3P) do not support them
unsigned Lo_src0_mods = 0;
unsigned Lo_src1_mods = 0;
-
- MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
+ MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst
if (src0_Mods & SISrcMods::OP_SEL_0) {
if (src0_Mods & SISrcMods::NEG) {
@@ -476,7 +427,7 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
unsigned Hi_src0_mods = 0;
unsigned Hi_src1_mods = 0;
- MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+ MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst
if (src0_Mods & SISrcMods::OP_SEL_1) {
if (src0_Mods & SISrcMods::NEG_HI) {
@@ -600,29 +551,6 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
return;
}
-bool GCNPreRAOptimizationsImpl::unpackInsts(MachineFunction &MF) {
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- TII = ST.getInstrInfo();
- TRI = &TII->getRegisterInfo();
-
- auto schedModel = TII->getSchedModel();
- for (MachineBasicBlock &MBB : MF) {
- std::unordered_set<MachineInstr *> seen;
- for (MachineInstr &MI : MBB) {
- if (SIInstrInfo::isMFMA(MI)){
- createListOfPackedInstr(MI, seen);
- }
-
- }
- if (!seen.empty()) {
- for (MachineInstr *MI : seen)
- insertMI(*MI);
- }
- }
- return true;
-}
-
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -646,7 +574,6 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
bool Changed = false;
- Changed = unpackInsts(MF);
for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!LIS->hasInterval(Reg))
@@ -659,38 +586,46 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
Changed |= processReg(Reg);
}
- if (!ST.useRealTrue16Insts())
- return Changed;
-
// Add RA hints to improve True16 COPY elimination.
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- if (MI.getOpcode() != AMDGPU::COPY)
- continue;
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- if (Dst.isVirtual() &&
- MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
- Src.isPhysical() &&
- TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
- if (Src.isVirtual() &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
- Dst.isPhysical() &&
- TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
- if (!Dst.isVirtual() || !Src.isVirtual())
- continue;
- if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
- MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
- MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+  // Unpack packed instructions to overlap MFMAs. This allows the compiler to
+  // co-issue unpacked instructions with MFMAs.
+ for (MachineBasicBlock &MBB : MF) {
+ DenseSet<MachineInstr *> instrsToUnpack;
+ for (MachineInstr &MI : MBB) {
+ if (SIInstrInfo::isMFMA(MI)){
+ createListOfPackedInstr(MI, instrsToUnpack);
+ }
+ if (ST.useRealTrue16Insts()){
+ if (MI.getOpcode() != AMDGPU::COPY)
+ continue;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ if (Dst.isVirtual() &&
+ MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ Src.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+ if (Src.isVirtual() &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+ Dst.isPhysical() &&
+ TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ continue;
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+ MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+ }
+ if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+ MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+ MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
}
- if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
- MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
- MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+ }
+
+ if (!instrsToUnpack.empty()) {
+ for (MachineInstr *MI : instrsToUnpack)
+ insertMI(*MI);
}
}
-
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c2da937552240..5562ff590b71d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -15,7 +15,6 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
-#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -6173,6 +6172,64 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return isImmOperandLegal(MI, OpIdx, *MO);
}
+bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
+ bool IsGFX950Only = ST.hasGFX950Insts();
+ if (!IsGFX950Only)
+ return false;
+
+ if (!isVALU(MI))
+ return false;
+
+ // V_COS, V_EXP, V_RCP, etc.
+ if (isTRANS(MI))
+ return true;
+
+ // DOT2, DOT2C, DOT4, etc.
+ if (isDOT(MI))
+ return true;
+
+ // MFMA, SMFMA
+ if (isMFMA(MI))
+ return true;
+
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_CVT_PK_BF8_F32_e64:
+ case AMDGPU::V_CVT_PK_FP8_F32_e64:
+ case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+ case AMDGPU::V_MQSAD_U32_U8_e64:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_I16:
+ case AMDGPU::V_PK_ADD_U16:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_FMA_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMAC_F16_e32:
+ case AMDGPU::V_PK_FMAC_F16_e64:
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_MAD_I16:
+ case AMDGPU::V_PK_MAD_U16:
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_PK_MAX_I16:
+ case AMDGPU::V_PK_MAX_U16:
+ case AMDGPU::V_PK_MIN_F16:
+ case AMDGPU::V_PK_MIN_I16:
+ case AMDGPU::V_PK_MIN_U16:
+ case AMDGPU::V_PK_MOV_B32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_LO_U16:
+ case AMDGPU::V_PK_SUB_I16:
+ case AMDGPU::V_PK_SUB_U16:
+ case AMDGPU::V_QSAD_PK_U16_U8_e64:
+ return true;
+ default:
+ return false;
+ }
+}
+
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e042b59eb0f04..b7a0388470279 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1178,6 +1178,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const;
+ bool isNeverCoissue(MachineInstr &MI) const;
/// Return true if this 64-bit VALU instruction has a 32-bit encoding.
/// This function will return false if you pass it a 32-bit instruction.
bool hasVALU32BitEncoding(unsigned Opcode) const;
>From c695b99ddae061127e015daf523b8eeec7888b71 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 1 Aug 2025 09:14:29 -0500
Subject: [PATCH 5/9] add code comments
---
llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 844fc1439099f..0f7009a6ea394 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -262,6 +262,9 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) cons
uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
unsigned Opcode = I.getOpcode();
+ // Use the 64-bit (VOP3) encodings: VOP3 lets the VOP3P source modifiers be
+ // translated onto the unpacked instructions, whereas the e32 (VOP2)
+ // encodings do not support source modifiers.
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
return AMDGPU::V_ADD_F32_e64;
>From 1a51a42d4c633cd1a1a84878b2a3dce6764473b4 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Wed, 6 Aug 2025 16:24:08 -0500
Subject: [PATCH 6/9] removing repetitive code, capitalize vars
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 95 +++++++------------
1 file changed, 36 insertions(+), 59 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 0f7009a6ea394..f56d73e990269 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -318,25 +318,25 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
int NumInst = 0;
auto E = BB->end();
- auto schedModel = TII->getSchedModel();
- const MCSchedClassDesc *schedClassDesc = schedModel.resolveSchedClass(&BeginMI);
- const int NumMFMACycles = schedModel.getWriteProcResBegin(schedClassDesc)->ReleaseAtCycle;
- int totalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
+ const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&BeginMI);
+ const int NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ int TotalCyclesBetweenCandidates = 0;
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
- const MCSchedClassDesc *instrSchedClassDesc = schedModel.resolveSchedClass(&Instr);
- totalCyclesBetweenCandidates += schedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
+ const MCSchedClassDesc *instrSchedClassDesc = SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates += SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
if (Instr.isMetaInstruction())
continue;
if (Instr.isTerminator())
return false;
- if (totalCyclesBetweenCandidates > NumMFMACycles)
+ if (TotalCyclesBetweenCandidates > NumMFMACycles)
return false;
if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
- totalCyclesBetweenCandidates += 1;
+ TotalCyclesBetweenCandidates += 1;
instrsToUnpack.insert(&Instr);
}
}
@@ -411,10 +411,8 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
if (isVreg_64) {
Op0L_Op1L->getOperand(0).setIsUndef();
}
- else {
- if (I.getOperand(0).isUndef()) {
- Op0L_Op1L->getOperand(0).setIsUndef();
- }
+ else if (I.getOperand(0).isUndef()){
+ Op0L_Op1L->getOperand(0).setIsUndef();
}
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
@@ -499,58 +497,37 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1);
- if ((Src1RC->getID() == AMDGPU::SGPR_64RegClassID) ||
- (Src0RC->getID() == AMDGPU::SGPR_64RegClassID)) {
- if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) {
- // try with sgpr32
- SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 4);
- MachineInstr *CopySGPR1 = copyInstrs[0];
- MachineInstr *CopySGPR2 = copyInstrs[1];
-
- if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
- CopySGPR2->getOperand(0), true);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
- } else {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
- CopySGPR2->getOperand(0), false);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
- }
- }
- else {
- SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
- MachineInstr *CopySGPR1 = copyInstrs[0];
- MachineInstr *CopySGPR2 = copyInstrs[1];
-
- if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, true);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
- } else {
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, false);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
- }
- }
- return;
- }
+ if (Src1RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ // try with sgpr32
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 4);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
- if (DstRC->getID() == AMDGPU::VReg_512_Align2RegClassID) {
+ bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, SrcMO2, SrcMO1,
- SrcMO2, false);
+ I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), isVReg64);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ return;
}
- else if (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID) {
+ else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
+ MachineInstr *CopySGPR1 = copyInstrs[0];
+ MachineInstr *CopySGPR2 = copyInstrs[1];
+
+ bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, SrcMO2, SrcMO1,
- SrcMO2, true);
+ I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, isVReg64);
+ unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ return;
}
+
+ bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
+ SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
+ I, DstMO, SrcMO1, SrcMO2, SrcMO1,
+ SrcMO2, isVReg64);
return;
}
>From e9056e866ab3dd91e145430e83b9603f76d8b486 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 15 Aug 2025 18:00:36 -0500
Subject: [PATCH 7/9] adding support for FP16 ops
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 316 +++++++++++++++++-
...unpack-non-coissue-insts-post-scheduler.ll | 116 -------
2 files changed, 302 insertions(+), 130 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index f56d73e990269..33e07c5a16d97 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -44,13 +44,14 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"
-#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
#include "GCNSchedStrategy.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
@@ -65,7 +66,7 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
- bool createListOfPackedInstr(MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack);
+ bool createListOfPackedInstr(MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack, uint16_t NumMFMACycles);
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
void insertMI(MachineInstr &I);
uint16_t mapToUnpackedOpcode(MachineInstr &I);
@@ -75,6 +76,10 @@ class GCNPreRAOptimizationsImpl {
insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
bool isVreg_64);
+ void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget);
+ bool IsF16MaskSet;
+ Register MaskLo; //mask used to extract the lower 16 bits of F16 packed operands
+ Register ShiftAmt; //shift amount (16) used to extract the upper 16 bits of F16 packed operands
public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -252,6 +257,8 @@ bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) cons
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_ADD_F16:
return true;
default:
@@ -270,6 +277,10 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
return AMDGPU::V_ADD_F32_e64;
case AMDGPU::V_PK_MUL_F32:
return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::V_PK_ADD_F16:
+ return AMDGPU::V_ADD_F16_e64;
+ case AMDGPU::V_PK_MUL_F16:
+ return AMDGPU::V_MUL_F16_e64;
default:
return std::numeric_limits<uint16_t>::max();
@@ -312,16 +323,15 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
}
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
- MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack) {
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack, uint16_t NumMFMACycles) {
auto *BB = BeginMI.getParent();
auto *MF = BB->getParent();
int NumInst = 0;
auto E = BB->end();
- auto SchedModel = TII->getSchedModel();
- const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&BeginMI);
- const int NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+
int TotalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
const MCSchedClassDesc *instrSchedClassDesc = SchedModel.resolveSchedClass(&Instr);
@@ -334,10 +344,41 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
if (TotalCyclesBetweenCandidates > NumMFMACycles)
return false;
-
+
if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
- TotalCyclesBetweenCandidates += 1;
- instrsToUnpack.insert(&Instr);
+ if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) || (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)){
+ // Unpacking packed F16 instructions takes several instructions: some to
+ // extract the lower and upper 16 bits of each operand, two unpacked F16
+ // operations, and more to recombine the results into the original
+ // destination register. The following sequence is issued:
+
+ // The first two MOVs materialize the masks in VGPRs. Ideally immediates
+ // would be used, but if one of the source operands is an SGPR, inline
+ // immediates are not allowed; hence the copies into VGPRs.
+
+ // vgpr_32 = V_MOV_B32_e32 65535
+ // vgpr_32 = V_MOV_B32_e32 16
+
+ // vgpr_32 = V_AND_B32_e32 sub1:sreg_64, vgpr_32
+ // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, sub1:sreg_64
+ // vgpr_32 = V_AND_B32_e32 vgpr_32, vgpr_32
+ // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, vgpr_32
+ // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
+ // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
+ // vgpr_32 = V_LSHLREV_B32_e64 vgpr_32, vgpr_32
+ // dst_reg = V_OR_B32_e64 vgpr_32, vgpr_32
+
+ // The MOV instructions above only need to be issued once; once they are,
+ // the IsF16MaskSet flag is set and subsequent unpacking only issues the
+ // remaining instructions. Each instruction above has a latency of one
+ // cycle, which is hard-coded here to keep the code simple.
+ if (IsF16MaskSet)
+ TotalCyclesBetweenCandidates += 7;
+ else
+ TotalCyclesBetweenCandidates += 9;
+ }
+ else
+ TotalCyclesBetweenCandidates += 1;
+
+ if (!(TotalCyclesBetweenCandidates > NumMFMACycles))
+ instrsToUnpack.insert(&Instr);
}
}
return true;
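As a sanity check of the mask/shift sequence described in the comments above, here is a small standalone C++ model, not part of the patch, of what the emitted V_AND_B32 / V_LSHRREV_B32 / V_LSHLREV_B32 / V_OR_B32 sequence does to a 32-bit register holding two packed 16-bit values. Standard C++ has no f16 type, so halfOp below is only a placeholder for the scalar V_MUL_F16_e64 / V_ADD_F16_e64 the hardware performs on each half.

#include <cstdint>
#include <cstdio>

// Placeholder for the unpacked scalar F16 operation; XOR just makes the data
// flow between the two halves visible.
static uint16_t halfOp(uint16_t A, uint16_t B) { return A ^ B; }

// Model of the unpack-operate-repack sequence emitted for a packed F16 op:
//   lo  = src & 0xFFFF      (V_AND_B32 with MaskLo)
//   hi  = src >> 16         (V_LSHRREV_B32 with ShiftAmt)
//   r0  = halfOp(lo0, lo1)  (unpacked F16 op on the low halves)
//   r1  = halfOp(hi0, hi1)  (unpacked F16 op on the high halves)
//   dst = (r1 << 16) | r0   (V_LSHLREV_B32 + V_OR_B32)
static uint32_t unpackThenRepack(uint32_t Src0, uint32_t Src1) {
  const uint32_t MaskLo = 0x0000FFFFu; // materialized once via V_MOV_B32
  const uint32_t ShiftAmt = 16;        // materialized once via V_MOV_B32

  uint16_t Lo0 = Src0 & MaskLo, Lo1 = Src1 & MaskLo;
  uint16_t Hi0 = Src0 >> ShiftAmt, Hi1 = Src1 >> ShiftAmt;

  uint32_t LoRes = halfOp(Lo0, Lo1);
  uint32_t HiRes = halfOp(Hi0, Hi1);
  return (HiRes << ShiftAmt) | LoRes;
}

int main() {
  uint32_t A = 0x3C004000u; // packed pair {hi=0x3C00, lo=0x4000}
  uint32_t B = 0x40003C00u; // packed pair {hi=0x4000, lo=0x3C00}
  std::printf("result = 0x%08X\n", (unsigned)unpackThenRepack(A, B));
}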
@@ -531,6 +572,242 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
return;
}
+void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget) {
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ MachineOperand &DstMO = I.getOperand(0);
+ MachineOperand &SrcMO0 = I.getOperand(2);
+ MachineOperand &SrcMO1 = I.getOperand(4);
+
+ Register DstReg = DstMO.getReg();
+ Register SrcReg0 = SrcMO0.getReg();
+ Register SrcReg1 = SrcMO1.getReg();
+
+ const DebugLoc &DL = I.getDebugLoc();
+
+ const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
+ auto SchedModel = TII->getSchedModel();
+
+ uint16_t AddlCyclesConsumed = 0;
+ SetVector<MachineInstr *> ListOfNewInstructions;
+
+ auto BuildImm = [&](uint32_t Val) -> std::pair<Register, uint16_t> {
+ Register ImmReg = MRI.createVirtualRegister(RC);
+ auto newMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ .addImm(Val);
+ LIS->InsertMachineInstrInMaps(*newMI);
+ const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(newMI);
+ uint16_t LatencyCycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ return {ImmReg, LatencyCycles};
+ };
+
+
+ if (!IsF16MaskSet) {
+ std::pair<Register, uint16_t> RegAndLatency = BuildImm(0x0000FFFF);
+ MaskLo = RegAndLatency.first; //mask for lower 16 bits
+ AddlCyclesConsumed += RegAndLatency.second;
+ RegAndLatency = BuildImm(16);
+ ShiftAmt = RegAndLatency.first; //shift amount used to extract the upper 16 bits
+ AddlCyclesConsumed += RegAndLatency.second;
+ IsF16MaskSet = true;
+ }
+
+ Register Src0_Lo = MRI.createVirtualRegister(RC);
+ Register Src1_Lo = MRI.createVirtualRegister(RC);
+ Register Src0_Hi = MRI.createVirtualRegister(RC);
+ Register Src1_Hi = MRI.createVirtualRegister(RC);
+ Register Input0 = MRI.createVirtualRegister(RC);
+ Register Input1 = MRI.createVirtualRegister(RC);
+
+ unsigned SubRegID = 0;
+ if (SrcMO0.getSubReg())
+ SubRegID = SrcMO0.getSubReg();
+
+ int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
+ unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
+ int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int64_t clampVal = I.getOperand(clampIdx).getImm();
+
+ // handle op_sel for src0
+ if (src0_Mods & SISrcMods::OP_SEL_0) {
+ // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo)
+ .addReg(ShiftAmt);
+ if (SubRegID)
+ LoInput0_MI.addReg(SrcReg0, 0, SubRegID);
+ else
+ LoInput0_MI.addReg(SrcReg0);
+ LIS->InsertMachineInstrInMaps(*LoInput0_MI);
+ }
+ else {
+ // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo);
+ if (SubRegID)
+ LoInput0_MI.addReg(SrcReg0, 0, SubRegID);
+ else
+ LoInput0_MI.addReg(SrcReg0);
+ LoInput0_MI.addReg(MaskLo);
+ LIS->InsertMachineInstrInMaps(*LoInput0_MI);
+ }
+
+ // handle op_sel_hi for src0
+ if (src0_Mods & SISrcMods::OP_SEL_1) {
+ // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi)
+ .addReg(ShiftAmt);
+ if (SubRegID)
+ HiInput0_MI.addReg(SrcReg0, 0, SubRegID);
+ else
+ HiInput0_MI.addReg(SrcReg0);
+ LIS->InsertMachineInstrInMaps(*HiInput0_MI);
+ }
+ else {
+ // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi);
+ if (SubRegID)
+ HiInput0_MI.addReg(SrcReg0, 0, SubRegID);
+ else
+ HiInput0_MI.addReg(SrcReg0);
+ HiInput0_MI.addReg(MaskLo);
+ LIS->InsertMachineInstrInMaps(*HiInput0_MI);
+ }
+
+ SubRegID = 0;
+ if (SrcMO1.getSubReg())
+ SubRegID = SrcMO1.getSubReg();
+ // handle op_sel for src1
+ if (src1_Mods & SISrcMods::OP_SEL_0) {
+ // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo)
+ .addReg(ShiftAmt);
+ if (SubRegID)
+ LoInput1_MI.addReg(SrcReg1, 0, SubRegID);
+ else
+ LoInput1_MI.addReg(SrcReg1);
+ LIS->InsertMachineInstrInMaps(*LoInput1_MI);
+ }
+ else {
+ // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo);
+ if (SubRegID)
+ LoInput1_MI.addReg(SrcReg1, 0, SubRegID);
+ else
+ LoInput1_MI.addReg(SrcReg1);
+ LoInput1_MI.addReg(MaskLo);
+ LIS->InsertMachineInstrInMaps(*LoInput1_MI);
+ }
+
+ // handle op_sel_hi for src1
+ if (src1_Mods & SISrcMods::OP_SEL_1) {
+ // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi)
+ .addReg(ShiftAmt);
+ if (SubRegID)
+ HiInput1_MI.addReg(SrcReg1, 0, SubRegID);
+ else
+ HiInput1_MI.addReg(SrcReg1);
+ LIS->InsertMachineInstrInMaps(*HiInput1_MI);
+ }
+ else {
+ // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
+ MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi);
+ if (SubRegID)
+ HiInput1_MI.addReg(SrcReg1, 0, SubRegID);
+ else
+ HiInput1_MI.addReg(SrcReg1);
+ HiInput1_MI.addReg(MaskLo);
+ LIS->InsertMachineInstrInMaps(*HiInput1_MI);
+ }
+
+ Register LoMul = MRI.createVirtualRegister(RC);
+ Register HiMul = MRI.createVirtualRegister(RC);
+
+ unsigned Lo_src0_mods = 0;
+ unsigned Lo_src1_mods = 0;
+ uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
+
+ // Unpacked instructions
+ MachineInstrBuilder LoMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul);
+
+ if (src0_Mods & SISrcMods::NEG)
+ Lo_src0_mods |= SISrcMods::NEG;
+
+ LoMul_MI.addImm(Lo_src0_mods); //src0_modifiers
+ LoMul_MI.addReg(Src0_Lo, RegState::Kill); //src0
+
+ if (src1_Mods & SISrcMods::NEG)
+ Lo_src1_mods |= SISrcMods::NEG;
+
+ LoMul_MI.addImm(Lo_src1_mods); //src1_modifiers
+ LoMul_MI.addReg(Src1_Lo, RegState::Kill); //src1
+ LoMul_MI.addImm(clampVal); //clamp
+ //packed instructions do not support output modifiers. safe to assign them 0 for this use case
+ LoMul_MI.addImm(0); //omod
+
+ // unpacked instruction with VOP3 encoding for Hi bits
+ unsigned Hi_src0_mods = 0;
+ unsigned Hi_src1_mods = 0;
+
+ MachineInstrBuilder HiMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul);
+ if (src0_Mods & SISrcMods::NEG_HI)
+    Hi_src0_mods |= SISrcMods::NEG;
+
+ HiMul_MI.addImm(Hi_src0_mods); //src0_modifiers
+ HiMul_MI.addReg(Src0_Hi, RegState::Kill); //select higher 16 bits if op_sel_hi is set
+
+ if (src1_Mods & SISrcMods::NEG_HI)
+    Hi_src1_mods |= SISrcMods::NEG;
+
+ HiMul_MI.addImm(Hi_src1_mods); //src0_modifiers
+ HiMul_MI.addReg(Src1_Hi, RegState::Kill); //select higher 16 bits from src1 if op_sel_hi is set
+ HiMul_MI.addImm(clampVal); //clamp
+ //packed instructions do not support output modifiers. safe to assign them 0 for this use case
+ HiMul_MI.addImm(0); //omod
+
+ // Shift HiMul left by 16
+ Register HiMulShifted = MRI.createVirtualRegister(RC);
+ MachineInstrBuilder HiMulShifted_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted)
+ .addReg(ShiftAmt)
+ .addReg(HiMul);
+
+ SubRegID = 0;
+ if (DstMO.getSubReg())
+ SubRegID = DstMO.getSubReg();
+ // OR LoMul | (HiMul << 16)
+ MachineInstrBuilder RewriteBackToDst_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64));
+ if (SubRegID) {
+ if (DstMO.isUndef()){
+ RewriteBackToDst_MI.addDef(DstReg, RegState::Undef, SubRegID);
+ }
+ else {
+ RewriteBackToDst_MI.addDef(DstReg, 0, SubRegID);
+ }
+ }
+ else {
+ if (DstMO.isUndef()){
+ RewriteBackToDst_MI.addDef(DstReg, RegState::Undef);
+ }
+ else {
+ RewriteBackToDst_MI.addDef(DstReg);
+ }
+ }
+ RewriteBackToDst_MI.addReg(LoMul);
+ RewriteBackToDst_MI.addReg(HiMulShifted);
+
+ LIS->InsertMachineInstrInMaps(*LoMul_MI);
+ LIS->InsertMachineInstrInMaps(*HiMul_MI);
+ LIS->InsertMachineInstrInMaps(*HiMulShifted_MI);
+ LIS->InsertMachineInstrInMaps(*RewriteBackToDst_MI);
+ LIS->RemoveMachineInstrFromMaps(I);
+ I.eraseFromParent();
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+
+}
+
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -569,10 +846,15 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
// Add RA hints to improve True16 COPY elimination.
// Unpack packed instructions to overlap MFMAs. This allows the compiler to co-issue unpacked instructions with MFMA
for (MachineBasicBlock &MBB : MF) {
- DenseSet<MachineInstr *> instrsToUnpack;
+ SetVector<MachineInstr *> instrsToUnpack;
+ IsF16MaskSet = false;
+ uint16_t NumMFMACycles = 0;
+ auto SchedModel = TII->getSchedModel();
for (MachineInstr &MI : MBB) {
if (SIInstrInfo::isMFMA(MI)){
- createListOfPackedInstr(MI, instrsToUnpack);
+ const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&MI);
+ NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ // createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles);
}
if (ST.useRealTrue16Insts()){
if (MI.getOpcode() != AMDGPU::COPY)
@@ -603,9 +885,15 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
}
if (!instrsToUnpack.empty()) {
- for (MachineInstr *MI : instrsToUnpack)
- insertMI(*MI);
+ for (MachineInstr *MI : instrsToUnpack) {
+ if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) || (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) {
+ processF16Unpacking(*MI, NumMFMACycles);
+ }
+ else {
+ insertMI(*MI);
+ }
+ }
}
}
return Changed;
-}
+}
\ No newline at end of file
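As an aside for reviewers: the net effect of the V_MOV/V_AND/V_LSHRREV/V_MUL/V_LSHLREV/V_OR sequence emitted by processF16Unpacking above can be summarized with the standalone C++ sketch below. It is illustrative only and not part of the patch; it assumes op_sel and op_sel_hi are clear on both sources (when set, the shifted upper half feeds the corresponding unpacked operation instead), and the unpacked half-precision operation itself is left as a placeholder callback.

#include <cstdint>

// Models how a packed 2xf16 register is split into halves (V_AND_B32 /
// V_LSHRREV_B32), how each half feeds an unpacked operation, and how the
// results are recombined into the 32-bit destination (V_LSHLREV_B32 /
// V_OR_B32). UnpackedOp stands in for V_MUL_F16_e64 / V_ADD_F16_e64.
using UnpackedOp = uint16_t (*)(uint16_t, uint16_t);

uint32_t unpackAndRepack(uint32_t Src0, uint32_t Src1, UnpackedOp Op) {
  const uint32_t MaskLo = 0x0000FFFF; // vgpr_32 = V_MOV_B32_e32 65535
  const uint32_t ShiftAmt = 16;       // vgpr_32 = V_MOV_B32_e32 16

  uint32_t Src0Lo = Src0 & MaskLo;    // V_AND_B32_e32
  uint32_t Src1Lo = Src1 & MaskLo;    // V_AND_B32_e32
  uint32_t Src0Hi = Src0 >> ShiftAmt; // V_LSHRREV_B32_e64
  uint32_t Src1Hi = Src1 >> ShiftAmt; // V_LSHRREV_B32_e64

  uint32_t Lo = Op(uint16_t(Src0Lo), uint16_t(Src1Lo)); // unpacked lo half
  uint32_t Hi = Op(uint16_t(Src0Hi), uint16_t(Src1Hi)); // unpacked hi half

  return Lo | (Hi << ShiftAmt); // V_LSHLREV_B32_e64 + V_OR_B32_e64
}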
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
deleted file mode 100644
index 5c6d376c92e65..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.ll
+++ /dev/null
@@ -1,116 +0,0 @@
-; TODO: change variable names. Make test smaller if possible
-
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-amd-amdhsa"
-
- at global_smem = external addrspace(3) global [0 x i8], align 16
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare float @llvm.amdgcn.exp2.f32(float)
-
-; Function Attrs: nofree norecurse nounwind
-define amdgpu_kernel void @attn_fwd(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg readonly captures(none) %1, ptr addrspace(1) inreg readonly captures(none) %2, ptr addrspace(1) inreg writeonly captures(none) %3, ptr addrspace(1) inreg writeonly captures(none) %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %12, i32 inreg %13, i32 inreg %14, i32 inreg %15, i32 inreg %16, i32 inreg %17, i32 inreg %18, i32 inreg %19, i32 inreg %20, i32 inreg %21, i32 inreg %22, float inreg %23, i32 inreg %24, ptr addrspace(1) inreg readnone captures(none) %25, i32 inreg %26, ptr addrspace(1) inreg readnone captures(none) %27) local_unnamed_addr {
- %29 = tail call i32 @llvm.amdgcn.workgroup.id.x()
-
- %96 = sext i32 %8 to i64
- %97 = getelementptr half, ptr addrspace(1) %1, i64 %96
-
- %115 = icmp slt i32 %29, 16384
-
- %135 = icmp slt i32 %29, 1
-
- %215 = getelementptr half, ptr addrspace(3) @global_smem, i32 %29
- %216 = load <8 x half>, ptr addrspace(3) %215, align 16
-
- %276 = shl nuw nsw i32 %29, 7
-
- %396 = getelementptr half, ptr addrspace(1) %97, i64 1
- %397 = sext i32 %13 to i64
- %398 = getelementptr half, ptr addrspace(1) %97, i64 %397
-
- %536 = fsub float 0xFFF0000000000000, 0.5
- %537 = tail call float @llvm.amdgcn.exp2.f32(float %536)
-
- %538 = getelementptr half, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 16384), i32 %29
- %539 = load <8 x half>, ptr addrspace(3) %538, align 16
-
- %573 = icmp ult i32 1, 511
- br i1 %573, label %575, label %574
-
-574: ; preds = %28
- br label %575
-
-575: ; preds = %574, %28
- %610 = shufflevector <8 x half> %539, <8 x half> poison, <2 x i32> <i32 0, i32 1>
-
- br label %686
-
-686: ; preds = %575, %686
- %.pn347561 = phi float [ %537, %575 ], [ %1329, %686 ]
-
-
- %690 = phi i32 [ 0, %575 ], [ %1120, %686 ]
- %691 = phi ptr addrspace(1) [ %398, %575 ], [ %1117, %686 ]
- %692 = phi ptr addrspace(1) [ %396, %575 ], [ %1116, %686 ]
-
- %695 = phi <2 x half> [ %610, %575 ], [ %1414, %686 ]
-
-
- %759 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ]
- %760 = phi <2 x float> [ zeroinitializer, %575 ], [ %1478, %686 ]
-
- %tmp6 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ]
- %tmp7 = phi <2 x float> [ zeroinitializer, %575 ], [ %tmp5, %686 ]
-
- %871 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
- tail call void @llvm.amdgcn.s.setprio(i16 0)
- %872 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %216, <8 x half> %216, <16 x float> %871, i32 0, i32 0, i32 0)
- %879 = extractelement <16 x float> %872, i64 0
-
-
- %957 = insertelement <2 x float> poison, float %.pn347561, i64 0
- %958 = shufflevector <2 x float> %957, <2 x float> poison, <2 x i32> zeroinitializer
- %959 = fmul <2 x float> %759, %958
- %960 = fmul <2 x float> %760, %958
-
- %tmp1 = fmul <2 x float> %tmp6, %958
- %tmp2 = fmul <2 x float> %tmp7, %958
-
- %1048 = shufflevector <2 x half> %695, <2 x half> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-
- %1116 = getelementptr half, ptr addrspace(1) %692, i64 1
- %1117 = getelementptr half, ptr addrspace(1) %691, i64 %397
-
- %1119 = icmp slt i32 %690, 2
- %1120 = select i1 %1119, i32 %690, i32 0
- %.idx359 = shl i32 %1120, 14
- %1121 = getelementptr i8, ptr addrspace(3) @global_smem, i32 %.idx359
-
- %1140 = shufflevector <8 x half> %1048, <8 x half> %1048, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-
- %1157 = shufflevector <2 x float> %959, <2 x float> %960, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
- %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-
- %1173 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %1157, i32 0, i32 0, i32 0)
- %tmp4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1048, <8 x half> %1140, <16 x float> %tmp3, i32 0, i32 0, i32 0)
-
-
- %1329 = tail call float @llvm.amdgcn.exp2.f32(float %879)
-
- %.idx367 = shl i32 %690, 14
- %1404 = getelementptr i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 32768), i32 %.idx367
-
- %1412 = add nuw nsw i32 0, 64
- %1413 = icmp samesign ult i32 0, 7936
- %1414 = shufflevector <8 x half> %1140, <8 x half> poison, <2 x i32> <i32 0, i32 1>
-
- %1478 = shufflevector <16 x float> %1173, <16 x float> poison, <2 x i32> <i32 0, i32 1>
- %tmp5 = shufflevector <16 x float> %tmp4, <16 x float> poison, <2 x i32> <i32 0, i32 1>
-
- br i1 %1413, label %686, label %1510
-
-1510: ; preds = %686
- ret void
-}
>From 5cb47d262a7d865e2ce9fa006e079db2676b4edb Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 15 Aug 2025 18:24:13 -0500
Subject: [PATCH 8/9] code fix
---
llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 33e07c5a16d97..5dac4a210101e 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -854,7 +854,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
if (SIInstrInfo::isMFMA(MI)){
const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&MI);
NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
- // createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles);
+ createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles);
}
if (ST.useRealTrue16Insts()){
if (MI.getOpcode() != AMDGPU::COPY)
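The one-line change above re-enables the latency budgeting, which compares the cycles consumed after an MFMA against the MFMA's own issue cycles taken from the scheduling model. Below is a minimal sketch of how that per-instruction cycle count is read; the helper name is mine, the calls mirror the ones already used in the pass, and like the pass it assumes the sched class has at least one write-resource entry.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/MC/MCSchedule.h"

using namespace llvm;

// Returns the ReleaseAtCycle of the first processor-resource write of MI's
// sched class, i.e. the value the pass accumulates into
// TotalCyclesBetweenCandidates and uses as NumMFMACycles for MFMAs.
static uint16_t getIssueCycles(const TargetSchedModel &SchedModel,
                               const MachineInstr &MI) {
  const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(&MI);
  return SchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}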
>From 178a36354b4b109c4c59572f90205457386c77e6 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Sun, 17 Aug 2025 09:32:00 -0500
Subject: [PATCH 9/9] clang-formatted and mir tests added
---
.../Target/AMDGPU/GCNPreRAOptimizations.cpp | 493 ++++++++++--------
...npack-non-coissue-insts-post-scheduler.mir | 209 ++++++++
2 files changed, 482 insertions(+), 220 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 5dac4a210101e..9a2f898dcb2de 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -28,30 +28,28 @@
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
/// the VGPR_32, the COPY can be completely eliminated.
///
-/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32 and V_PK_ADD_F32)
-/// adjacent to MFMAs such that they can be co-issued.
-/// This helps with overlapping MFMA and certain vector instructions in machine schedules
+/// Additionally, this pass unpacks V_PK_MUL_F32/F16 and V_PK_ADD_F32/F16
+/// instructions adjacent to MFMAs so that they can be co-issued. This helps
+/// with overlapping MFMA and certain vector instructions in machine schedules
/// and is expected to improve performance.
-/// Only those packed instructions are unpacked that are overlapped by the MFMA latency.
-/// Rest should remain untouched.
+/// Only packed instructions whose unpacked sequence fits within the MFMA
+/// latency are unpacked; the rest are left untouched.
//===----------------------------------------------------------------------===//
-#include "GCNPreRAOptimizations.h"
#include "AMDGPU.h"
+#include "GCNPreRAOptimizations.h"
+#include "GCNSchedStrategy.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/ADT/SetVector.h"
-#include "SIInstrInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/InitializePasses.h"
-#include "GCNSchedStrategy.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
-#include <utility>
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/InitializePasses.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
@@ -66,20 +64,24 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;
bool processReg(Register Reg);
- bool createListOfPackedInstr(MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack, uint16_t NumMFMACycles);
+ bool createListOfPackedInstr(MachineInstr &BeginMI,
+ SetVector<MachineInstr *> &instrsToUnpack,
+ uint16_t NumMFMACycles);
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
void insertMI(MachineInstr &I);
uint16_t mapToUnpackedOpcode(MachineInstr &I);
SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
unsigned SGPRSrcPos);
SmallVector<MachineInstr *, 2>
- insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
- MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
+ insertUnpackedMI(MachineInstr &I, MachineOperand &DstMO,
+ MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
+ MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2,
bool isVreg_64);
void processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget);
bool IsF16MaskSet;
- Register MaskLo; //mask to extract lower 16 bits for F16 packed instructions
- Register ShiftAmt; //mask to extract higher 16 bits from F16 packed instructions
+ Register MaskLo; // mask to extract lower 16 bits for F16 packed instructions
+  Register ShiftAmt; // shift amount to extract the higher 16 bits from F16
+                     // packed instructions
public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -252,18 +254,18 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
-bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
+ MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_ADD_F16:
- return true;
-
- default:
- return false;
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_ADD_F16:
+ return true;
+ default:
+ return false;
}
}
@@ -273,23 +275,22 @@ uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
// VOP3 instructions allow VOP3P source modifiers to be translated to VOP3
// e32 instructions are VOP2 and don't allow source modifiers
switch (Opcode) {
- case AMDGPU::V_PK_ADD_F32:
- return AMDGPU::V_ADD_F32_e64;
- case AMDGPU::V_PK_MUL_F32:
- return AMDGPU::V_MUL_F32_e64;
- case AMDGPU::V_PK_ADD_F16:
- return AMDGPU::V_ADD_F16_e64;
- case AMDGPU::V_PK_MUL_F16:
- return AMDGPU::V_MUL_F16_e64;
- default:
- return std::numeric_limits<uint16_t>::max();
-
+ case AMDGPU::V_PK_ADD_F32:
+ return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::V_PK_MUL_F32:
+ return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::V_PK_ADD_F16:
+ return AMDGPU::V_ADD_F16_e64;
+ case AMDGPU::V_PK_MUL_F16:
+ return AMDGPU::V_MUL_F16_e64;
+ default:
+ return std::numeric_limits<uint16_t>::max();
}
}
SmallVector<MachineInstr *, 2>
GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
- unsigned SGPRSrcPos) {
+ unsigned SGPRSrcPos) {
SmallVector<MachineInstr *, 2> MIList;
MachineBasicBlock &MBB = *I.getParent();
@@ -323,37 +324,46 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
}
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
- MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack, uint16_t NumMFMACycles) {
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &instrsToUnpack,
+ uint16_t NumMFMACycles) {
auto *BB = BeginMI.getParent();
auto *MF = BB->getParent();
int NumInst = 0;
auto E = BB->end();
-
+
int TotalCyclesBetweenCandidates = 0;
auto SchedModel = TII->getSchedModel();
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
- const MCSchedClassDesc *instrSchedClassDesc = SchedModel.resolveSchedClass(&Instr);
- TotalCyclesBetweenCandidates += SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
+ const MCSchedClassDesc *instrSchedClassDesc =
+ SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates +=
+ SchedModel.getWriteProcResBegin(instrSchedClassDesc)->ReleaseAtCycle;
if (Instr.isMetaInstruction())
continue;
if (Instr.isTerminator())
return false;
-
+
if (TotalCyclesBetweenCandidates > NumMFMACycles)
return false;
-
+
if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
- if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) || (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)){
- // unpacking packed F16 instructions requires multiple instructions. Instructions are issued to extract lower and higher bits for each operand
- // Instructions are then issued for 2 unpacked instructions, and additional instructions to put them back into the original destination register
- // The following sequence of instructions are issued
-
- // The next two are needed to move masks into vgprs. Ideally, immediates should be used. However, if one of the source operands are sgpr/sregs,
- // then immediates are not allowed. Hence, the need to move these into vgprs
-
+ if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) ||
+ (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)) {
+        // Unpacking a packed F16 instruction requires multiple instructions:
+        // instructions to extract the lower and higher 16 bits of each
+        // operand, the two unpacked instructions themselves, and additional
+        // instructions to write the results back into the original
+        // destination register. The following sequence of instructions is
+        // issued.
+
+        // The next two moves materialize the mask and the shift amount in
+        // VGPRs. Ideally, immediates would be used, but if one of the source
+        // operands is an SGPR, immediates are not allowed. Hence the need to
+        // move these values into VGPRs.
+
// vgpr_32 = V_MOV_B32_e32 65535
// vgpr_32 = V_MOV_B32_e32 16
@@ -365,18 +375,19 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
// vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
// vgpr_32 = V_LSHLREV_B32_e64 vgpr_32, vgpr_32
// dst_reg = V_OR_B32_e64 vgpr_32, vgpr_32
-
- // we need to issue the MOV instructions above only once. Once these are issued, the IsF16MaskSet flag is set
- // subsequent unpacking only needs to issue the remaining instructions
- // The number of latency cycles for each instruction above is 1. It's hard coded into the code to reduce code complexity.
- if (IsF16MaskSet)
+
+        // The MOV instructions above need to be issued only once. Once they
+        // are issued, the IsF16MaskSet flag is set, and subsequent unpacking
+        // only needs to issue the remaining instructions. The latency of each
+        // instruction above is one cycle; it is hard-coded here to keep the
+        // code simple.
+ if (IsF16MaskSet)
TotalCyclesBetweenCandidates += 7;
else
TotalCyclesBetweenCandidates += 9;
- }
- else
+ } else
TotalCyclesBetweenCandidates += 1;
-
+
if (!(TotalCyclesBetweenCandidates > NumMFMACycles))
instrsToUnpack.insert(&Instr);
}
@@ -385,8 +396,9 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
}
SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
- MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1, MachineOperand &LoSrcMO2,
- MachineOperand &HiSrcMO1, MachineOperand &HiSrcMO2, bool isVreg_64) {
+ MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
+ MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1,
+ MachineOperand &HiSrcMO2, bool isVreg_64) {
SmallVector<MachineInstr *, 2> MIList;
MachineBasicBlock &MBB = *I.getParent();
@@ -404,103 +416,117 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
const MCInstrDesc instrDesc = I.getDesc();
- int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int clampIdx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
int64_t clampVal = I.getOperand(clampIdx).getImm();
- int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
- int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ int src0_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int src1_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
- //don't worry about abs values. Packed instructions (VOP3P) do not support them
+ // don't worry about abs values. Packed instructions (VOP3P) do not support
+ // them
unsigned Lo_src0_mods = 0;
unsigned Lo_src1_mods = 0;
uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
- Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst
+ Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst
if (src0_Mods & SISrcMods::OP_SEL_0) {
if (src0_Mods & SISrcMods::NEG) {
Lo_src0_mods |= SISrcMods::NEG;
}
- Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers
- unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
- Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0
- }
- else {
- Op0L_Op1L.addImm(Lo_src0_mods); //src0_modifiers
- unsigned Src0SubIdx = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
- Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel == 0, select register 0 of reg:sub0_sub1
+ Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); // src0
+ } else {
+ Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0,
+ Src0SubIdx); // src0 //if op_sel == 0, select register 0 of
+ // reg:sub0_sub1
}
-
if (src1_Mods & SISrcMods::OP_SEL_0) {
if (src1_Mods & SISrcMods::NEG) {
Lo_src1_mods |= SISrcMods::NEG;
}
- Op0L_Op1L.addImm(Lo_src1_mods); //src0_modifiers
- unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
- Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src0
- }
- else {
- Op0L_Op1L.addImm(Lo_src1_mods); //src0_modifiers
- unsigned Src1SubIdx = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
- Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- }
- Op0L_Op1L.addImm(clampVal); //clamp
- //packed instructions do not support output modifiers. safe to assign them 0 for this use case
- Op0L_Op1L.addImm(0); //omod
+    Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers
+    unsigned Src1SubIdx =
+        TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+    Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); // src1
+  } else {
+    Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers
+    unsigned Src1SubIdx =
+        TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
+    Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0,
+                     Src1SubIdx); // src1 // if op_sel == 0, select sub0 of
+                                  // reg:sub0_sub1
+ }
+ Op0L_Op1L.addImm(clampVal); // clamp
+ // packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ Op0L_Op1L.addImm(0); // omod
if (isVreg_64) {
Op0L_Op1L->getOperand(0).setIsUndef();
- }
- else if (I.getOperand(0).isUndef()){
+ } else if (I.getOperand(0).isUndef()) {
Op0L_Op1L->getOperand(0).setIsUndef();
}
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
- SrcSubIdx1 =
- TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
- SrcSubIdx2 =
- TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
- DestSubIdx =
- TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
+ SrcSubIdx1 = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ SrcSubIdx2 = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
- //don't worry about abs values. Packed instructions (VOP3P) do not support them
+ // don't worry about abs values. Packed instructions (VOP3P) do not support
+ // them
unsigned Hi_src0_mods = 0;
unsigned Hi_src1_mods = 0;
MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
- Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst
+ Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst
if (src0_Mods & SISrcMods::OP_SEL_1) {
if (src0_Mods & SISrcMods::NEG_HI) {
Hi_src0_mods |= SISrcMods::NEG;
}
- Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers
- unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1);
- Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0
- }
- else {
- Op0H_Op1H.addImm(Hi_src0_mods); //src0_modifiers
- unsigned Src0SubIdx = TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0);
- Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
+ Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); // src0
+ } else {
+ Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0,
+ Src0SubIdx); // src0 //if op_sel_hi == 0, select register 0
+ // of reg:sub0_sub1
}
if (src1_Mods & SISrcMods::OP_SEL_1) {
if (src1_Mods & SISrcMods::NEG_HI) {
Hi_src1_mods |= SISrcMods::NEG;
}
- Op0H_Op1H.addImm(Hi_src1_mods); //src0_modifiers
- unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1);
- Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src0
- }
- else {
- Op0H_Op1H.addImm(Hi_src1_mods); //src0_modifiers
- unsigned Src1SubIdx = TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0);
- Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); //src0 //if op_sel_hi == 0, select register 0 of reg:sub0_sub1
- }
- Op0H_Op1H.addImm(clampVal); //clamp
- //packed instructions do not support output modifiers. safe to assign them 0 for this use case
- Op0H_Op1H.addImm(0); //omod
+    Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers
+    unsigned Src1SubIdx =
+        TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1);
+    Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); // src1
+  } else {
+    Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers
+    unsigned Src1SubIdx =
+        TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0);
+    Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0,
+                     Src1SubIdx); // src1 // if op_sel_hi == 0, select sub0 of
+                                  // reg:sub0_sub1
+ }
+ Op0H_Op1H.addImm(clampVal); // clamp
+ // packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ Op0H_Op1H.addImm(0); // omod
LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
@@ -524,16 +550,15 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg1 = I.getOperand(2).getReg();
Register SrcReg2 = I.getOperand(4).getReg();
-
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
- MachineBasicBlock::iterator MII = I;
const DebugLoc &DL = I.getDebugLoc();
const TargetRegisterClass *DstRC = MRI.getRegClass(I.getOperand(0).getReg());
const TargetRegisterClass *Src0RC = MRI.getRegClass(I.getOperand(2).getReg());
const TargetRegisterClass *Src1RC = MRI.getRegClass(I.getOperand(4).getReg());
+
const TargetRegisterClass *Src0SubRC =
TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
const TargetRegisterClass *SrcRC = TRI->getSubClassWithSubReg(Src0RC, 1);
@@ -545,34 +570,38 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
MachineInstr *CopySGPR2 = copyInstrs[1];
bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
- CopySGPR2->getOperand(0), isVReg64);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(2).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(2).getReg(), TRI);
+ SmallVector<MachineInstr *, 2> unpackedInstrs =
+ insertUnpackedMI(I, DstMO, SrcMO1, CopySGPR1->getOperand(0), SrcMO1,
+ CopySGPR2->getOperand(0), isVReg64);
+ unpackedInstrs[0]->addRegisterKilled(
+ unpackedInstrs[0]->getOperand(2).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(
+ unpackedInstrs[1]->getOperand(2).getReg(), TRI);
return;
- }
- else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) {
+ } else if (Src0RC->getID() == AMDGPU::SGPR_64RegClassID) {
SmallVector<MachineInstr *, 2> copyInstrs = copyToVregAndInsertMI(I, 2);
MachineInstr *CopySGPR1 = copyInstrs[0];
MachineInstr *CopySGPR2 = copyInstrs[1];
bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, CopySGPR1->getOperand(0), SrcMO2, CopySGPR2->getOperand(0), SrcMO2, isVReg64);
- unpackedInstrs[0]->addRegisterKilled(unpackedInstrs[0]->getOperand(1).getReg(), TRI);
- unpackedInstrs[1]->addRegisterKilled(unpackedInstrs[1]->getOperand(1).getReg(), TRI);
+ SmallVector<MachineInstr *, 2> unpackedInstrs =
+ insertUnpackedMI(I, DstMO, CopySGPR1->getOperand(0), SrcMO2,
+ CopySGPR2->getOperand(0), SrcMO2, isVReg64);
+ unpackedInstrs[0]->addRegisterKilled(
+ unpackedInstrs[0]->getOperand(1).getReg(), TRI);
+ unpackedInstrs[1]->addRegisterKilled(
+ unpackedInstrs[1]->getOperand(1).getReg(), TRI);
return;
}
bool isVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
- SmallVector<MachineInstr *, 2> unpackedInstrs = insertUnpackedMI(
- I, DstMO, SrcMO1, SrcMO2, SrcMO1,
- SrcMO2, isVReg64);
+ SmallVector<MachineInstr *, 2> unpackedInstrs =
+ insertUnpackedMI(I, DstMO, SrcMO1, SrcMO2, SrcMO1, SrcMO2, isVReg64);
return;
}
-void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t AvailableBudget) {
+void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I,
+ uint16_t AvailableBudget) {
MachineBasicBlock &MBB = *I.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -585,7 +614,7 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
Register SrcReg1 = SrcMO1.getReg();
const DebugLoc &DL = I.getDebugLoc();
-
+
const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
auto SchedModel = TII->getSchedModel();
@@ -595,24 +624,25 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
auto BuildImm = [&](uint32_t Val) -> std::pair<Register, uint16_t> {
Register ImmReg = MRI.createVirtualRegister(RC);
auto newMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), ImmReg)
- .addImm(Val);
+ .addImm(Val);
LIS->InsertMachineInstrInMaps(*newMI);
- const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(newMI);
- uint16_t LatencyCycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ const MCSchedClassDesc *SchedClassDesc =
+ SchedModel.resolveSchedClass(newMI);
+ uint16_t LatencyCycles =
+ SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
return {ImmReg, LatencyCycles};
};
-
if (!IsF16MaskSet) {
std::pair<Register, uint16_t> RegAndLatency = BuildImm(0x0000FFFF);
- MaskLo = RegAndLatency.first; //mask for lower 16 bits
+ MaskLo = RegAndLatency.first; // mask for lower 16 bits
AddlCyclesConsumed += RegAndLatency.second;
RegAndLatency = BuildImm(16);
- ShiftAmt = RegAndLatency.first; //mask for higher 16 bits
+    ShiftAmt = RegAndLatency.first; // shift amount for the higher 16 bits
AddlCyclesConsumed += RegAndLatency.second;
IsF16MaskSet = true;
}
-
+
Register Src0_Lo = MRI.createVirtualRegister(RC);
Register Src1_Lo = MRI.createVirtualRegister(RC);
Register Src0_Hi = MRI.createVirtualRegister(RC);
@@ -624,27 +654,33 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
if (SrcMO0.getSubReg())
SubRegID = SrcMO0.getSubReg();
- int src0_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
- int src1_modifiers_Idx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ int src0_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int src1_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
unsigned src0_Mods = I.getOperand(src0_modifiers_Idx).getImm();
unsigned src1_Mods = I.getOperand(src1_modifiers_Idx).getImm();
- int clampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int clampIdx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
int64_t clampVal = I.getOperand(clampIdx).getImm();
// handle op_sel for src0
if (src0_Mods & SISrcMods::OP_SEL_0) {
- // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo)
- .addReg(ShiftAmt);
+ // if op_sel is set, select higher 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder LoInput0_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Lo)
+ .addReg(ShiftAmt);
if (SubRegID)
LoInput0_MI.addReg(SrcReg0, 0, SubRegID);
else
LoInput0_MI.addReg(SrcReg0);
LIS->InsertMachineInstrInMaps(*LoInput0_MI);
- }
- else {
- // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder LoInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo);
+ } else {
+ // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder LoInput0_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Lo);
if (SubRegID)
LoInput0_MI.addReg(SrcReg0, 0, SubRegID);
else
@@ -655,18 +691,21 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
// handle op_sel_hi for src0
if (src0_Mods & SISrcMods::OP_SEL_1) {
- // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi)
- .addReg(ShiftAmt);
+ // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder HiInput0_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src0_Hi)
+ .addReg(ShiftAmt);
if (SubRegID)
HiInput0_MI.addReg(SrcReg0, 0, SubRegID);
else
HiInput0_MI.addReg(SrcReg0);
LIS->InsertMachineInstrInMaps(*HiInput0_MI);
- }
- else {
- // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder HiInput0_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi);
+ } else {
+ // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits
+ // of new vgpr
+ MachineInstrBuilder HiInput0_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src0_Hi);
if (SubRegID)
HiInput0_MI.addReg(SrcReg0, 0, SubRegID);
else
@@ -680,18 +719,21 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
SubRegID = SrcMO1.getSubReg();
// handle op_sel for src1
if (src1_Mods & SISrcMods::OP_SEL_0) {
- // if op_sel is set, select higher 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo)
- .addReg(ShiftAmt);
+ // if op_sel is set, select higher 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder LoInput1_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Lo)
+ .addReg(ShiftAmt);
if (SubRegID)
LoInput1_MI.addReg(SrcReg1, 0, SubRegID);
else
LoInput1_MI.addReg(SrcReg1);
LIS->InsertMachineInstrInMaps(*LoInput1_MI);
- }
- else {
- // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder LoInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo);
+ } else {
+ // if op_sel is not set, select lower 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder LoInput1_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Lo);
if (SubRegID)
LoInput1_MI.addReg(SrcReg1, 0, SubRegID);
else
@@ -702,18 +744,21 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
// handle op_sel_hi for src1
if (src1_Mods & SISrcMods::OP_SEL_1) {
- // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi)
- .addReg(ShiftAmt);
+ // if op_sel_hi is set, select higher 16 bits and copy into lower 16 bits of
+ // new vgpr
+ MachineInstrBuilder HiInput1_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), Src1_Hi)
+ .addReg(ShiftAmt);
if (SubRegID)
HiInput1_MI.addReg(SrcReg1, 0, SubRegID);
else
HiInput1_MI.addReg(SrcReg1);
LIS->InsertMachineInstrInMaps(*HiInput1_MI);
- }
- else {
- // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits of new vgpr
- MachineInstrBuilder HiInput1_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi);
+ } else {
+ // if op_sel_hi is not set, select lower 16 bits and copy into lower 16 bits
+ // of new vgpr
+ MachineInstrBuilder HiInput1_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_AND_B32_e32), Src1_Hi);
if (SubRegID)
HiInput1_MI.addReg(SrcReg1, 0, SubRegID);
else
@@ -728,75 +773,81 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
unsigned Lo_src0_mods = 0;
unsigned Lo_src1_mods = 0;
uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
-
+
// Unpacked instructions
- MachineInstrBuilder LoMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul);
+ MachineInstrBuilder LoMul_MI =
+ BuildMI(MBB, I, DL, TII->get(unpackedOpcode), LoMul);
- if (src0_Mods & SISrcMods::NEG)
+ if (src0_Mods & SISrcMods::NEG)
Lo_src0_mods |= SISrcMods::NEG;
- LoMul_MI.addImm(Lo_src0_mods); //src0_modifiers
- LoMul_MI.addReg(Src0_Lo, RegState::Kill); //src0
+ LoMul_MI.addImm(Lo_src0_mods); // src0_modifiers
+ LoMul_MI.addReg(Src0_Lo, RegState::Kill); // src0
if (src1_Mods & SISrcMods::NEG)
Lo_src1_mods |= SISrcMods::NEG;
- LoMul_MI.addImm(Lo_src1_mods); //src1_modifiers
- LoMul_MI.addReg(Src1_Lo, RegState::Kill); //src1
- LoMul_MI.addImm(clampVal); //clamp
- //packed instructions do not support output modifiers. safe to assign them 0 for this use case
- LoMul_MI.addImm(0); //omod
+ LoMul_MI.addImm(Lo_src1_mods); // src1_modifiers
+ LoMul_MI.addReg(Src1_Lo, RegState::Kill); // src1
+ LoMul_MI.addImm(clampVal); // clamp
+ // packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ LoMul_MI.addImm(0); // omod
- // unpacked instruction with VOP3 encoding for Hi bits
+ // unpacked instruction with VOP3 encoding for Hi bits
unsigned Hi_src0_mods = 0;
unsigned Hi_src1_mods = 0;
- MachineInstrBuilder HiMul_MI = BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul);
- if (src0_Mods & SISrcMods::NEG_HI)
+ MachineInstrBuilder HiMul_MI =
+ BuildMI(MBB, I, DL, TII->get(unpackedOpcode), HiMul);
+ if (src0_Mods & SISrcMods::NEG_HI)
    Hi_src0_mods |= SISrcMods::NEG;
-
- HiMul_MI.addImm(Hi_src0_mods); //src0_modifiers
- HiMul_MI.addReg(Src0_Hi, RegState::Kill); //select higher 16 bits if op_sel_hi is set
+
+ HiMul_MI.addImm(Hi_src0_mods); // src0_modifiers
+ HiMul_MI.addReg(Src0_Hi,
+ RegState::Kill); // select higher 16 bits if op_sel_hi is set
if (src1_Mods & SISrcMods::NEG_HI)
    Hi_src1_mods |= SISrcMods::NEG;
-
- HiMul_MI.addImm(Hi_src1_mods); //src0_modifiers
- HiMul_MI.addReg(Src1_Hi, RegState::Kill); //select higher 16 bits from src1 if op_sel_hi is set
- HiMul_MI.addImm(clampVal); //clamp
- //packed instructions do not support output modifiers. safe to assign them 0 for this use case
- HiMul_MI.addImm(0); //omod
+
+  HiMul_MI.addImm(Hi_src1_mods); // src1_modifiers
+ HiMul_MI.addReg(
+ Src1_Hi,
+ RegState::Kill); // select higher 16 bits from src1 if op_sel_hi is set
+ HiMul_MI.addImm(clampVal); // clamp
+ // packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ HiMul_MI.addImm(0); // omod
// Shift HiMul left by 16
Register HiMulShifted = MRI.createVirtualRegister(RC);
- MachineInstrBuilder HiMulShifted_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted)
- .addReg(ShiftAmt)
- .addReg(HiMul);
+ MachineInstrBuilder HiMulShifted_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_LSHLREV_B32_e64), HiMulShifted)
+ .addReg(ShiftAmt)
+ .addReg(HiMul);
SubRegID = 0;
if (DstMO.getSubReg())
SubRegID = DstMO.getSubReg();
// OR LoMul | (HiMul << 16)
- MachineInstrBuilder RewriteBackToDst_MI = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64));
+ MachineInstrBuilder RewriteBackToDst_MI =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_OR_B32_e64));
if (SubRegID) {
- if (DstMO.isUndef()){
+ if (DstMO.isUndef()) {
RewriteBackToDst_MI.addDef(DstReg, RegState::Undef, SubRegID);
- }
- else {
+ } else {
RewriteBackToDst_MI.addDef(DstReg, 0, SubRegID);
}
- }
- else {
- if (DstMO.isUndef()){
+ } else {
+ if (DstMO.isUndef()) {
RewriteBackToDst_MI.addDef(DstReg, RegState::Undef);
- }
- else {
+ } else {
RewriteBackToDst_MI.addDef(DstReg);
}
}
RewriteBackToDst_MI.addReg(LoMul);
RewriteBackToDst_MI.addReg(HiMulShifted);
-
+
LIS->InsertMachineInstrInMaps(*LoMul_MI);
LIS->InsertMachineInstrInMaps(*HiMul_MI);
LIS->InsertMachineInstrInMaps(*HiMulShifted_MI);
@@ -805,7 +856,6 @@ void GCNPreRAOptimizationsImpl::processF16Unpacking(MachineInstr &I, uint16_t Av
I.eraseFromParent();
LIS->removeInterval(DstReg);
LIS->createAndComputeVirtRegInterval(DstReg);
-
}
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
@@ -844,19 +894,22 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
}
// Add RA hints to improve True16 COPY elimination.
- // Unpack packed instructions to overlap MFMAs. This allows the compiler to co-issue unpacked instructions with MFMA
+  // Unpack packed instructions to overlap MFMAs. This allows the compiler to
+  // co-issue unpacked instructions with MFMAs.
for (MachineBasicBlock &MBB : MF) {
SetVector<MachineInstr *> instrsToUnpack;
IsF16MaskSet = false;
uint16_t NumMFMACycles = 0;
auto SchedModel = TII->getSchedModel();
for (MachineInstr &MI : MBB) {
- if (SIInstrInfo::isMFMA(MI)){
- const MCSchedClassDesc *SchedClassDesc = SchedModel.resolveSchedClass(&MI);
- NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ if (SIInstrInfo::isMFMA(MI)) {
+ const MCSchedClassDesc *SchedClassDesc =
+ SchedModel.resolveSchedClass(&MI);
+ NumMFMACycles =
+ SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
createListOfPackedInstr(MI, instrsToUnpack, NumMFMACycles);
}
- if (ST.useRealTrue16Insts()){
+ if (ST.useRealTrue16Insts()) {
if (MI.getOpcode() != AMDGPU::COPY)
continue;
Register Dst = MI.getOperand(0).getReg();
@@ -883,13 +936,13 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
}
}
-
+
if (!instrsToUnpack.empty()) {
for (MachineInstr *MI : instrsToUnpack) {
- if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) || (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) {
+ if ((MI->getOpcode() == AMDGPU::V_PK_MUL_F16) ||
+ (MI->getOpcode() == AMDGPU::V_PK_ADD_F16)) {
processF16Unpacking(*MI, NumMFMACycles);
- }
- else {
+ } else {
insertMI(*MI);
}
}
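For the F32 path above, the subregister choice that insertUnpackedMI derives from op_sel/op_sel_hi can be condensed into the sketch below (struct and helper names are mine, not the patch's): the low unpacked instruction reads sub1 of the 64-bit packed source when OP_SEL_0 is set and sub0 otherwise, and the high one reads sub1 when OP_SEL_1 is set and sub0 otherwise.

// Stand-ins for AMDGPU::sub0 / AMDGPU::sub1.
enum : unsigned { Sub0 = 0, Sub1 = 1 };

struct SubRegChoice {
  unsigned LoSub; // subregister feeding the low unpacked instruction
  unsigned HiSub; // subregister feeding the high unpacked instruction
};

// OpSel corresponds to SISrcMods::OP_SEL_0 and OpSelHi to SISrcMods::OP_SEL_1
// on a given source operand of the packed instruction.
SubRegChoice chooseSubRegs(bool OpSel, bool OpSelHi) {
  return {OpSel ? Sub1 : Sub0, OpSelHi ? Sub1 : Sub0};
}

For example, a source modifier value of 8 (OP_SEL_1 only), as used in the new MIR test that follows, makes the low lane read sub0 and the high lane read sub1, which is what the V_MUL_F32_e64 check lines show.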
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
new file mode 100644
index 0000000000000..b13f61a963ed5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-scheduler.mir
@@ -0,0 +1,209 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -march=amdgcn -mcpu=gfx950 -run-pass=amdgpu-pre-ra-optimizations -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: test_pk_mul_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+
+ ; GCN-LABEL: name: test_pk_mul_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: KILL %1.sub6_sub7
+ ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+ ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+ ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+ ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+ ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+ ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub5, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+ %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+ %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+ early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+ KILL %8.sub6_sub7
+ early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+ %57:vreg_128_align2 = COPY %22
+ %58:vreg_128_align2 = COPY %23
+ undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+ %69.sub1:vreg_64_align2 = COPY %39.sub7
+ undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+ undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %75.sub1:vreg_64_align2 = COPY %39.sub5
+ %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_op_sel_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+
+ ; GCN-LABEL: name: test_op_sel_selection_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: KILL %1.sub6_sub7
+ ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+ ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+ ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+ ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+ ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+ ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub5, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+ %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+ %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+ early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+ KILL %8.sub6_sub7
+ early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+ %57:vreg_128_align2 = COPY %22
+ %58:vreg_128_align2 = COPY %23
+ undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+ %69.sub1:vreg_64_align2 = COPY %39.sub7
+ undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+ undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %75.sub1:vreg_64_align2 = COPY %39.sub5
+ %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub4_sub5:sgpr_512, 12, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_op_sel_hi_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_op_sel_hi_selection_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: early-clobber %4:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: KILL %1.sub6_sub7
+ ; GCN-NEXT: early-clobber %5:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_128_align2 = COPY [[S_LOAD_DWORDX4_IMM1]]
+ ; GCN-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub6
+ ; GCN-NEXT: [[COPY3:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub7
+ ; GCN-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64_align2 = COPY %4.sub4
+ ; GCN-NEXT: undef [[V_PK_MUL_F32_:%[0-9]+]].sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %5.sub6_sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead early-clobber %11:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[COPY4:%[0-9]+]].sub1:vreg_64_align2 = COPY %4.sub5
+ ; GCN-NEXT: [[V_PK_MUL_F32_:%[0-9]+]].sub0:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead [[V_PK_MUL_F32_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_MUL_F32_e64 0, %5.sub4, 0, [[COPY4]].sub1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+ %22:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub4_sub5, 0, 0
+ %23:sgpr_128 = S_LOAD_DWORDX4_IMM %8.sub6_sub7, 0, 0
+ early-clobber %39:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub6_sub7, 0, 0
+ KILL %8.sub6_sub7
+ early-clobber %24:sgpr_512 = S_LOAD_DWORDX16_IMM_ec %8.sub4_sub5, 0, 0
+ %57:vreg_128_align2 = COPY %22
+ %58:vreg_128_align2 = COPY %23
+ undef %69.sub0:vreg_64_align2 = COPY %39.sub6
+ %69.sub1:vreg_64_align2 = COPY %39.sub7
+ undef %75.sub0:vreg_64_align2 = COPY %39.sub4
+ undef %179.sub2_sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F32 8, %24.sub6_sub7, 8, %69, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %56:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %57, %58, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %75.sub1:vreg_64_align2 = COPY %39.sub5
+ %179.sub0_sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F32 0, %24.sub4_sub5:sgpr_512, 8, %75:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_only_overlapped_unpacking_f16
+tracksRegLiveness: true
+liveins:
+ - { reg: '$sgpr4_sgpr5', virtual-reg: '%3' }
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_only_overlapped_unpacking_f16
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GCN-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]](p4), 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: early-clobber %4:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %1.sub6_sub7, 0, 0
+ ; GCN-NEXT: dead [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ ; GCN-NEXT: early-clobber %6:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %1.sub4_sub5, 0, 0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_LOAD_DWORDX2_IMM1]]
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %4.sub7
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %4.sub6
+ ; GCN-NEXT: undef [[V_PK_MUL_F16_:%[0-9]+]].sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %6.sub7, 8, [[COPY3]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead early-clobber %12:areg_256_align2 = V_MFMA_F64_16X16X4F64_e64 [[COPY1]], [[COPY2]], 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+ ; GCN-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 %6.sub6, [[V_MOV_B32_e32_]], implicit $exec
+ ; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], %6.sub6, implicit $exec
+ ; GCN-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 [[COPY4]], [[V_MOV_B32_e32_]], implicit $exec
+ ; GCN-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[V_MOV_B32_e32_1]], [[COPY4]], implicit $exec
+ ; GCN-NEXT: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, killed [[V_AND_B32_e32_]], 0, killed [[V_AND_B32_e32_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_MUL_F16_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, killed [[V_LSHRREV_B32_e64_]], 0, killed [[V_LSHRREV_B32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[V_MOV_B32_e32_1]], [[V_MUL_F16_e64_1]], implicit $exec
+ ; GCN-NEXT: [[V_PK_MUL_F16_:%[0-9]+]].sub2:vreg_128_align2 = V_OR_B32_e64 [[V_MUL_F16_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %4.sub5
+ ; GCN-NEXT: dead [[V_PK_MUL_F16_:%[0-9]+]].sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %6.sub5, 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ %3:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ early-clobber %8:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %3(p4), 0, 0
+ %22:sreg_64_xexec = S_LOAD_DWORDX2_IMM %8.sub4_sub5, 0, 0
+ %23:sreg_64_xexec = S_LOAD_DWORDX2_IMM %8.sub6_sub7, 0, 0
+ early-clobber %25:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %8.sub6_sub7, 0, 0
+ %12:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+ early-clobber %24:sgpr_256 = S_LOAD_DWORDX8_IMM_ec %8.sub4_sub5, 0, 0
+ %29:vreg_64_align2 = COPY %22
+ %30:vreg_64_align2 = COPY %23
+ %51:vgpr_32 = COPY %25.sub7
+ %55:vgpr_32 = COPY %25.sub6
+ undef %99.sub3:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub7, 8, %51, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %28:areg_256_align2 = V_MFMA_F64_16X16X4F64_e64 %29, %30, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %99.sub2:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub6, 8, %55, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %59:vgpr_32 = COPY %25.sub5
+ %99.sub1:vreg_128_align2 = nofpexcept V_PK_MUL_F16 8, %24.sub5, 8, %59, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0