[llvm] Co-issue packed instructions by unpacking (PR #151704)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 25 09:14:32 PDT 2025
================
@@ -225,6 +254,712 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
+ MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
+ unsigned Opcode = I.getOpcode();
+ // Use the 64-bit (VOP3) encoding: VOP3P source modifiers can be translated
+ // to VOP3 source modifiers, whereas the e32 forms are VOP2 and do not
+ // accept source modifiers.
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F32:
+ return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::V_PK_MUL_F32:
+ return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::V_PK_ADD_F16:
+ return AMDGPU::V_ADD_F16_e64;
+ case AMDGPU::V_PK_MUL_F16:
+ return AMDGPU::V_MUL_F16_e64;
+ case AMDGPU::V_PK_FMA_F32:
+ return AMDGPU::V_FMA_F32_e64;
+ default:
+ return std::numeric_limits<uint16_t>::max();
+ }
+}
+
+bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+ uint16_t NumMFMACycles) {
+ auto *BB = BeginMI.getParent();
+ auto *MF = BB->getParent();
+ int NumInst = 0;
+
+ auto E = BB->end();
+
+ int TotalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
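+ // Starting after BeginMI, accumulate the issue cycles of each instruction
+ // encountered. Packed instructions seen while the running total is still
+ // within NumMFMACycles are recorded as unpacking candidates.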
+ for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+ MachineInstr &Instr = *I;
+ const MCSchedClassDesc *InstrSchedClassDesc =
+ SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates +=
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+ if (Instr.isMetaInstruction())
+ continue;
+
+ if (Instr.isTerminator())
+ return false;
+
+ if (TotalCyclesBetweenCandidates > NumMFMACycles)
+ return false;
+
+ if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
+ if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F16) ||
+ (Instr.getOpcode() == AMDGPU::V_PK_ADD_F16)) {
+ // Unpacking packed F16 instructions requires multiple instructions:
+ // instructions to extract the lower and upper 16 bits of each operand,
+ // the two unpacked instructions themselves, and additional instructions
+ // to recombine the results into the original destination register. The
+ // following sequence of instructions is issued:
+
+ // The next two moves place the masks into VGPRs. Ideally, immediates
+ // would be used directly, but that is not allowed when one of the
+ // source operands is an SGPR, hence the moves.
+
+ // vgpr_32 = V_MOV_B32_e32 65535
+ // vgpr_32 = V_MOV_B32_e32 16
+
+ // vgpr_32 = V_AND_B32_e32 sub1:sreg_64, vgpr_32
+ // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, sub1:sreg_64
+ // vgpr_32 = V_AND_B32_e32 vgpr_32, vgpr_32
+ // vgpr_32 = V_LSHRREV_B32_e64 vgpr_32, vgpr_32
+ // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
+ // vgpr_32 = V_MUL_F16_e64 0, killed vgpr_32, 0, killed vgpr_32, 0, 0
+ // vgpr_32 = V_LSHLREV_B32_e64 vgpr_32, vgpr_32
+ // dst_reg = V_OR_B32_e64 vgpr_32, vgpr_32
+
+ // The MOV instructions above only need to be issued once. Once they are
+ // issued, the IsF16MaskSet flag is set and subsequent unpacking only
+ // needs to issue the remaining instructions. Each instruction above
+ // takes 1 cycle; the cycle counts are hard coded here to keep the code
+ // simple.
+ if (IsF16MaskSet)
+ TotalCyclesBetweenCandidates += 7;
+ else
+ TotalCyclesBetweenCandidates += 9;
+ } else
+ TotalCyclesBetweenCandidates += 1;
+
+ if (TotalCyclesBetweenCandidates <= NumMFMACycles)
+ InstrsToUnpack.insert(&Instr);
+ }
+ }
+ return true;
+}
+
+void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(
+ MachineInstr &I, MachineOperand &DstMO, MachineOperand &LoSrcMO1,
+ MachineOperand &LoSrcMO2, MachineOperand &HiSrcMO1,
+ MachineOperand &HiSrcMO2, bool IsVreg_64) {
+
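+ // Sketch of the transform performed below, e.g. for the case where op_sel
+ // selects the low halves and op_sel_hi selects the high halves of the
+ // 64-bit sources:
+ //   dst:vreg_64 = V_PK_MUL_F32 mods, src0:vreg_64, mods, src1:vreg_64, ...
+ // becomes
+ //   dst.sub0 = V_MUL_F32_e64 lo_mods, src0.sub0, lo_mods, src1.sub0, clamp, 0
+ //   dst.sub1 = V_MUL_F32_e64 hi_mods, src0.sub1, hi_mods, src1.sub1, clamp, 0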
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register DstReg = DstMO.getReg();
+
+ unsigned SrcSubIdx1 =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
+ unsigned SrcSubIdx2 =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
+ unsigned DestSubIdx =
+ TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
+
+ const MCInstrDesc InstrDesc = I.getDesc();
+
+ int ClampIdx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int64_t ClampVal = I.getOperand(ClampIdx).getImm();
+
+ int Src0_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int Src1_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
+ unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
+
+ // Packed instructions (VOP3P) do not support abs, so the abs modifiers can
+ // safely be ignored here.
+ unsigned Lo_src0_mods = 0;
+ unsigned Lo_src1_mods = 0;
+ uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
+ return;
+
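+ // Build the unpacked VOP3 instruction that computes the low half of the
+ // result (dst.sub0).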
+ MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+ Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst
+ if (Src0_Mods & SISrcMods::NEG) {
+ Lo_src0_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src0_mods); // src0_modifiers
+ if (Src0_Mods & SISrcMods::OP_SEL_0) {
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); // src0
+ } else {
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub0);
+ // If op_sel == 0, select sub0 of reg:sub0_sub1.
+ Op0L_Op1L.addReg(LoSrcMO1.getReg(), 0, Src0SubIdx); // src0
+ }
+ if (Src1_Mods & SISrcMods::NEG) {
+ Lo_src1_mods |= SISrcMods::NEG;
+ }
+ Op0L_Op1L.addImm(Lo_src1_mods); // src1_modifiers
+ if (Src1_Mods & SISrcMods::OP_SEL_0) {
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); // src1
+ } else {
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub0);
+ // If op_sel == 0, select sub0 of reg:sub0_sub1.
+ Op0L_Op1L.addReg(LoSrcMO2.getReg(), 0, Src1SubIdx); // src1
+ }
+ Op0L_Op1L.addImm(ClampVal); // clamp
+ // Packed instructions do not support output modifiers, so it is safe to
+ // pass 0 here.
+ Op0L_Op1L.addImm(0); // omod
+
+ if (I.getOperand(0).isUndef()) {
+ Op0L_Op1L->getOperand(0).setIsUndef();
+ }
+ LIS->InsertMachineInstrInMaps(*Op0L_Op1L);
+ SrcSubIdx1 = TRI->composeSubRegIndices(LoSrcMO1.getSubReg(), AMDGPU::sub1);
+ SrcSubIdx2 = TRI->composeSubRegIndices(LoSrcMO2.getSubReg(), AMDGPU::sub1);
+ DestSubIdx = TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1);
+
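+ // Build the unpacked VOP3 instruction that computes the high half of the
+ // result (dst.sub1).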
+ // Packed instructions (VOP3P) do not support abs, so the abs modifiers can
+ // safely be ignored here.
+ unsigned Hi_src0_mods = 0;
+ unsigned Hi_src1_mods = 0;
+ MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+ Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst
+ if (Src0_Mods & SISrcMods::NEG_HI) {
+ Hi_src0_mods |= SISrcMods::NEG_HI;
+ }
+ Op0H_Op1H.addImm(Hi_src0_mods); // src0_modifiers
+ if (Src0_Mods & SISrcMods::OP_SEL_1) {
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); // src0
+ } else {
+ unsigned Src0SubIdx =
+ TRI->composeSubRegIndices(HiSrcMO1.getSubReg(), AMDGPU::sub0);
+ // If op_sel_hi == 0, select sub0 of reg:sub0_sub1.
+ Op0H_Op1H.addReg(HiSrcMO1.getReg(), 0, Src0SubIdx); // src0
+ }
+ if (Src1_Mods & SISrcMods::NEG_HI) {
+ Hi_src1_mods |= SISrcMods::NEG_HI;
+ }
+ Op0H_Op1H.addImm(Hi_src1_mods); // src1_modifiers
+ if (Src1_Mods & SISrcMods::OP_SEL_1) {
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub1);
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); // src1
+ } else {
+ unsigned Src1SubIdx =
+ TRI->composeSubRegIndices(HiSrcMO2.getSubReg(), AMDGPU::sub0);
+ // If op_sel_hi == 0, select sub0 of reg:sub0_sub1.
+ Op0H_Op1H.addReg(HiSrcMO2.getReg(), 0, Src1SubIdx); // src1
+ }
+ Op0H_Op1H.addImm(ClampVal); // clamp
+ // Packed instructions do not support output modifiers, so it is safe to
+ // pass 0 here.
+ Op0H_Op1H.addImm(0); // omod
+ LIS->InsertMachineInstrInMaps(*Op0H_Op1H);
+
+ if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ }
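+ // Replace the original packed instruction: remove it from the slot index
+ // maps, erase it, and recompute the live interval for DstReg now that it is
+ // defined by the two unpacked halves.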
+ LIS->RemoveMachineInstrFromMaps(I);
+ I.eraseFromParent();
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+ return;
+}
+
+void GCNPreRAOptimizationsImpl::processFMAF32Unpacking(MachineInstr &I) {
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg1 = I.getOperand(2).getReg();
+ Register SrcReg2 = I.getOperand(4).getReg();
+ Register SrcReg3 = I.getOperand(6).getReg();
----------------
jrbyrnes wrote:
Ditto
https://github.com/llvm/llvm-project/pull/151704
More information about the llvm-commits mailing list