[llvm] unpack packed instructions overlapped by MFMAs post-RA scheduling (PR #157968)
Akash Dutta via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 11 07:32:34 PDT 2025
https://github.com/akadutta updated https://github.com/llvm/llvm-project/pull/157968
>From c15104f964cb8c103e8e4aede15e561a853e0947 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Wed, 10 Sep 2025 18:04:47 -0500
Subject: [PATCH 1/2] unpack packed instructions overlapped by MFMAs post-RA
scheduling
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 60 ++++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 295 +++++++++++++++++-
...ck-non-coissue-insts-post-ra-scheduler.mir | 151 +++++++++
4 files changed, 503 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 398c99b3bd127..8fce521da157e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6359,6 +6359,66 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return isImmOperandLegal(MI, OpIdx, *MO);
}
+bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
+ bool IsGFX950Only = ST.hasGFX950Insts();
+ bool IsGFX940Only = ST.hasGFX940Insts();
+
+ if (!IsGFX950Only && !IsGFX940Only)
+ return false;
+
+ if (!isVALU(MI))
+ return false;
+
+ // V_COS, V_EXP, V_RCP, etc.
+ if (isTRANS(MI))
+ return true;
+
+ // DOT2, DOT2C, DOT4, etc.
+ if (isDOT(MI))
+ return true;
+
+ // MFMA, SMFMA
+ if (isMFMA(MI))
+ return true;
+
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_CVT_PK_BF8_F32_e64:
+ case AMDGPU::V_CVT_PK_FP8_F32_e64:
+ case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+ case AMDGPU::V_MQSAD_U32_U8_e64:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_I16:
+ case AMDGPU::V_PK_ADD_U16:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_FMA_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMAC_F16_e32:
+ case AMDGPU::V_PK_FMAC_F16_e64:
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_MAD_I16:
+ case AMDGPU::V_PK_MAD_U16:
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_PK_MAX_I16:
+ case AMDGPU::V_PK_MAX_U16:
+ case AMDGPU::V_PK_MIN_F16:
+ case AMDGPU::V_PK_MIN_I16:
+ case AMDGPU::V_PK_MIN_U16:
+ case AMDGPU::V_PK_MOV_B32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_LO_U16:
+ case AMDGPU::V_PK_SUB_I16:
+ case AMDGPU::V_PK_SUB_U16:
+ case AMDGPU::V_QSAD_PK_U16_U8_e64:
+ return true;
+ default:
+ return false;
+ }
+}
+
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f7dde2b90b68e..d0b49ffc19600 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1200,6 +1200,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return isImmOperandLegal(MI.getDesc(), OpNo, MO);
}
+ bool isNeverCoissue(MachineInstr &MI) const;
+
/// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isLegalAV64PseudoImm(uint64_t Imm) const;
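As a usage sketch of the new hook (a fragment, not part of the patch: `Candidates` is a hypothetical SetVector<MachineInstr *>, and `TII`/`isUnpackingSupportedInstr` are assumed to be in scope as in SIPreEmitPeephole below):

    // Queue an instruction for unpacking only if the target can never
    // co-issue it and it is an opcode we know how to unpack.
    if (TII->isNeverCoissue(MI) && isUnpackingSupportedInstr(MI))
      Candidates.insert(&MI);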
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 2c2ceedf8a2f6..cad096e0d2fcc 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -9,11 +9,19 @@
/// \file
/// This pass performs the peephole optimizations before code emission.
///
+/// Additionally, this pass unpacks packed instructions (V_PK_MUL_F32,
+/// V_PK_ADD_F32, V_PK_FMA_F32) adjacent to MFMAs so that the unpacked halves
+/// can be co-issued with the MFMA. This helps overlap MFMAs with vector ALU
+/// work in the machine schedule and is expected to improve performance. Only
+/// packed instructions that fall within the latency shadow of an MFMA are
+/// unpacked; the rest are left untouched.
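+/// For example (mirroring the MIR test added below), with the default op_sel
+/// encoding a packed multiply such as
+///   v_pk_mul_f32 v[16:17], s[30:31], v[4:5]
+/// becomes two independent 32-bit multiplies that can slot under the MFMA:
+///   v_mul_f32_e64 v16, s30, v4
+///   v_mul_f32_e64 v17, s31, v5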
+/// TODO: Add support for F16 packed instructions.
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"
@@ -28,6 +36,7 @@ class SIPreEmitPeephole {
private:
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
bool optimizeVccBranch(MachineInstr &MI) const;
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
@@ -39,6 +48,37 @@ class SIPreEmitPeephole {
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
+ // Check if the machine instruction being processed is a supported packed
+ // instruction.
+ bool isUnpackingSupportedInstr(MachineInstr &MI) const;
+ // Creates a list of packed instructions following an MFMA that are suitable
+ // for unpacking.
+ void createListOfPackedInstr(MachineInstr &BeginMI,
+ SetVector<MachineInstr *> &InstrsToUnpack,
+ uint16_t NumMFMACycles);
+ // Identify register dependencies between the MFMA and the following packed
+ // instructions. Conservatively ensures that we do not incorrectly read or
+ // write registers.
+ bool hasReadWriteDependencies(const MachineInstr &PredMI,
+ const MachineInstr &SuccMI);
+ // Unpack F32 packed instructions. Currently, only V_PK_MUL_F32,
+ // V_PK_ADD_F32, and V_PK_FMA_F32 are supported for this transformation.
+ void processF32Unpacking(MachineInstr &I);
+ // Insert the appropriate unpacked instructions into the basic block.
+ void insertUnpackedF32MI(MachineInstr &I, bool IsFMA);
+ // Map a packed instruction's opcode to the corresponding unpacked opcode.
+ uint16_t mapToUnpackedOpcode(MachineInstr &I);
+ // Creates the unpacked instruction to be inserted. Adds source modifiers
+ // based on the source modifiers of the packed instruction.
+ MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
+ const DebugLoc &DL,
+ uint16_t UnpackedOpcode, bool IsHiBits,
+ bool IsFMA);
+ void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods,
+ unsigned NegModifier, unsigned OpSelModifier,
+ MachineOperand &SrcMO);
public:
bool run(MachineFunction &MF);
@@ -274,11 +314,9 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
return false;
if (IdxReg && I->modifiesRegister(IdxReg, TRI))
return false;
- if (llvm::any_of(I->operands(),
- [&MRI, this](const MachineOperand &MO) {
- return MO.isReg() &&
- TRI->isVectorRegister(MRI, MO.getReg());
- })) {
+ if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
+ return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
+ })) {
// The only exception allowed here is another indirect vector move
// with the same mode.
if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
@@ -417,6 +455,233 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
+bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_FMA_F32:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI,
+ const MachineInstr &SuccMI) {
+ for (const MachineOperand &Pred_Ops : PredMI.operands()) {
+ if (!Pred_Ops.isReg() || !Pred_Ops.isDef())
+ continue;
+ Register Pred_Reg = Pred_Ops.getReg();
+ if (!Pred_Reg.isValid())
+ continue;
+ for (const MachineOperand &Succ_Ops : SuccMI.operands()) {
+ if (!Succ_Ops.isReg() || !Succ_Ops.isDef())
+ continue;
+ Register Succ_Reg = Succ_Ops.getReg();
+ if (!Succ_Reg.isValid())
+ continue;
+ if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
+ unsigned Opcode = I.getOpcode();
+ // Use the 64-bit encoding so that VOP3 instructions are emitted: VOP3P
+ // source modifiers can be translated to VOP3 modifiers, whereas e32
+ // instructions are VOP2 and do not allow source modifiers.
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F32:
+ return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::V_PK_MUL_F32:
+ return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::V_PK_FMA_F32:
+ return AMDGPU::V_FMA_F32_e64;
+ default:
+ return std::numeric_limits<uint16_t>::max();
+ }
+}
+
+void SIPreEmitPeephole::addOperandandMods(MachineInstrBuilder NewMI,
+ unsigned Src_Mods,
+ unsigned NegModifier,
+ unsigned OpSelModifier,
+ MachineOperand &SrcMO) {
+ unsigned New_Src_Mods = 0;
+ const TargetRegisterInfo *RI = SrcMO.getParent()
+ ->getParent()
+ ->getParent()
+ ->getSubtarget()
+ .getRegisterInfo();
+ // If NEG or NEG_HI is set, we need to negate the corresponding 32-bit lane.
+ // NEG_HI shares its bit position with ABS, which packed instructions do not
+ // support but unpacked VOP3 instructions do. The modifier bits therefore
+ // cannot be copied verbatim: NEG_HI must be re-encoded as a NEG source
+ // modifier on the instruction computing the higher 32 bits.
+ if (Src_Mods & NegModifier) {
+ New_Src_Mods |= SISrcMods::NEG;
+ }
+ // Only negative modifiers are added when needed. Unpacked operations do not
+ // have op_sel, so it is handled explicitly below. Unpacked operations
+ // support abs, but packed instructions do not, so abs is never set here.
+ NewMI.addImm(New_Src_Mods);
+ if (SrcMO.isImm()) {
+ NewMI.addImm(SrcMO.getImm());
+ } else {
+ // If the op_sel bit is clear, select sub0 of reg:sub0_sub1; otherwise sub1.
+ Register UnpackedSrcReg = (Src_Mods & OpSelModifier)
+ ? RI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
+ : RI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
+ if (SrcMO.isReg() && SrcMO.isKill())
+ NewMI.addReg(UnpackedSrcReg, RegState::Kill);
+ else
+ NewMI.addReg(UnpackedSrcReg);
+ }
+}
+
+void SIPreEmitPeephole::createListOfPackedInstr(
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+ uint16_t NumMFMACycles) {
+ auto *BB = BeginMI.getParent();
+ auto E = BB->end();
+ int TotalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
+ for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+ MachineInstr &Instr = *I;
+ // Skip meta instructions before the cycle accounting; they do not consume
+ // issue cycles.
+ if (Instr.isMetaInstruction())
+ continue;
+ if (Instr.isTerminator())
+ return;
+ const MCSchedClassDesc *InstrSchedClassDesc =
+ SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates +=
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+
+ if (TotalCyclesBetweenCandidates > NumMFMACycles)
+ return;
+ if (isUnpackingSupportedInstr(Instr) && TII->isNeverCoissue(Instr)) {
+ if (hasReadWriteDependencies(BeginMI, Instr))
+ continue;
+
+ // If it is a packed instruction, subtract its latency from the overall
+ // latency calculation here, because the packed instruction will be removed
+ // and replaced by 2 unpacked instructions.
+ TotalCyclesBetweenCandidates -=
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+ // We're adding 2 to account for the extra latency added by unpacking into
+ // 2 instructions. At the time of writing, the considered unpacked
+ // instructions have latency of 1.
+ // TODO: improve latency handling of possible inserted instructions
+ TotalCyclesBetweenCandidates += 2;
+ // if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) {
+ InstrsToUnpack.insert(&Instr);
+ // }
+ }
+ }
+ return;
+}
+
+void SIPreEmitPeephole::processF32Unpacking(MachineInstr &I) {
+ if (SIInstrInfo::modifiesModeRegister(I) ||
+ I.modifiesRegister(AMDGPU::EXEC, TRI))
+ return;
+ bool IsFMA = I.getOpcode() == AMDGPU::V_PK_FMA_F32;
+ insertUnpackedF32MI(I, IsFMA);
+}
+
+void SIPreEmitPeephole::insertUnpackedF32MI(MachineInstr &I, bool IsFMA) {
+ MachineBasicBlock &MBB = *I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
+ return;
+
+ MachineInstrBuilder Op0L_Op1L = createUnpackedMI(
+ MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA);
+ if (I.getOperand(0).isUndef())
+ Op0L_Op1L->getOperand(0).setIsUndef();
+
+ MachineInstrBuilder Op0H_Op1H = createUnpackedMI(
+ MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA);
+
+ if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ }
+ if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract);
+ }
+ if (I.getOperand(0).getReg().isPhysical() && I.getOperand(0).isRenamable()) {
+ Op0L_Op1L.getInstr()->getOperand(0).setIsRenamable(true);
+ Op0H_Op1H.getInstr()->getOperand(0).setIsRenamable(true);
+ }
+
+ I.eraseFromParent();
+}
+
+MachineInstrBuilder
+SIPreEmitPeephole::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
+ const DebugLoc &DL, uint16_t UnpackedOpcode,
+ bool IsHiBits, bool IsFMA) {
+ MachineOperand &DstMO = I.getOperand(0);
+ MachineOperand &SrcMO1 = I.getOperand(2);
+ MachineOperand &SrcMO2 = I.getOperand(4);
+ Register DstReg = DstMO.getReg();
+ const TargetRegisterInfo *RI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ Register UnpackedDstReg = IsHiBits ? RI->getSubReg(DstReg, AMDGPU::sub1)
+ : RI->getSubReg(DstReg, AMDGPU::sub0);
+
+ int ClampIdx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int64_t ClampVal = I.getOperand(ClampIdx).getImm();
+ int Src0_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int Src1_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+
+ unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
+ unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
+ // Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
+
+ unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
+ unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
+
+ MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+ NewMI.addDef(UnpackedDstReg); // vdst
+ addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1);
+ addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2);
+
+ if (IsFMA) {
+ MachineOperand &SrcMO3 = I.getOperand(6);
+ int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(
+ I.getOpcode(), AMDGPU::OpName::src2_modifiers);
+ unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
+ addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3);
+ }
+ NewMI.addImm(ClampVal); // clamp
+ // Packed instructions do not support output modifiers, so it is safe to
+ // assign 0 here.
+ NewMI.addImm(0); // omod
+ return NewMI;
+}
+
PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
@@ -430,6 +695,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
bool Changed = false;
MF.RenumberBlocks();
@@ -461,7 +727,21 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
// and limit the distance to 20 instructions for compile time purposes.
// Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
// may be bundled with the instructions they modify.
+ //
+ // Unpack packed instructions overlapped by MFMAs. This allows the compiler
+ // to co-issue the unpacked halves with the MFMA.
+ uint16_t NumMFMACycles = 0;
+ auto SchedModel = TII->getSchedModel();
+ SetVector<MachineInstr *> InstrsToUnpack;
+
for (auto &MI : make_early_inc_range(MBB.instrs())) {
+ if (SIInstrInfo::isMFMA(MI)) {
+ const MCSchedClassDesc *SchedClassDesc =
+ SchedModel.resolveSchedClass(&MI);
+ NumMFMACycles =
+ SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles);
+ }
if (Count == Threshold)
SetGPRMI = nullptr;
else
@@ -481,6 +761,11 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
else
SetGPRMI = &MI;
}
+ for (MachineInstr *MI : InstrsToUnpack)
+ processF32Unpacking(*MI);
}
return Changed;
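To make the cycle bookkeeping above concrete, here is a small self-contained model of the acceptance check. The cycle values are invented for illustration, and `shouldUnpack` is a hypothetical helper, not part of the patch; real ReleaseAtCycle numbers come from the target scheduling model:

    #include <cassert>
    #include <cstdint>

    // Models createListOfPackedInstr's decision for one candidate: the running
    // total already includes the packed instruction's own cost, which is
    // replaced by the cost of the two unpacked ops (2 cycles), and the
    // candidate is accepted only while the total stays inside the MFMA window.
    static bool shouldUnpack(int CyclesSoFar, int PackedCost,
                             uint16_t NumMFMACycles) {
      int Total = CyclesSoFar - PackedCost + 2;
      return Total < NumMFMACycles - 1; // acceptance condition after PATCH 2/2
    }

    int main() {
      // A 1-cycle packed mul sitting 4 cycles after a 16-cycle MFMA:
      // (4 + 1) - 1 + 2 = 6, which fits under 16 - 1 = 15.
      assert(shouldUnpack(5, 1, 16));
      // The same candidate 14 cycles in no longer fits: 15 - 1 + 2 = 16 >= 15.
      assert(!shouldUnpack(15, 1, 16));
      return 0;
    }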
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
new file mode 100644
index 0000000000000..532344ea9cbd5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -0,0 +1,151 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -march=amdgcn -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: test_pk_mul_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_pk_mul_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ S_WAITCNT 49279
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ S_WAITCNT 49279
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_op_sel_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5'}
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_op_sel_selection_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ S_WAITCNT 49279
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ S_WAITCNT 49279
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_op_sel_hi_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5'}
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_op_sel_hi_selection_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ S_WAITCNT 49279
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ S_WAITCNT 49279
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
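
A quick key for reading the V_PK_MUL_F32 src_modifiers immediates in these three tests, assuming the usual SISrcMods bit assignments (OP_SEL_0 = 1<<2, OP_SEL_1 = 1<<3):

     8 = OP_SEL_1             -> lo op reads sub0, hi op reads sub1 (the default)
    12 = OP_SEL_0 | OP_SEL_1  -> both unpacked ops read sub1 (v5 in the checks)
     0 = no op_sel bits set   -> both unpacked ops read sub0 (s30 in the checks)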
>From fb05da7a164c5227f8cda6c7055ce879c98c7844 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 11 Sep 2025 09:32:08 -0500
Subject: [PATCH 2/2] format error fix and unpack candidate selection condition
change
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 68 ++++++++++----------
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +-
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 5 +-
3 files changed, 37 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8fce521da157e..2048f61e1486a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6383,40 +6383,40 @@ bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- case AMDGPU::V_CVT_PK_BF8_F32_e64:
- case AMDGPU::V_CVT_PK_FP8_F32_e64:
- case AMDGPU::V_MQSAD_PK_U16_U8_e64:
- case AMDGPU::V_MQSAD_U32_U8_e64:
- case AMDGPU::V_PK_ADD_F16:
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_ADD_I16:
- case AMDGPU::V_PK_ADD_U16:
- case AMDGPU::V_PK_ASHRREV_I16:
- case AMDGPU::V_PK_FMA_F16:
- case AMDGPU::V_PK_FMA_F32:
- case AMDGPU::V_PK_FMAC_F16_e32:
- case AMDGPU::V_PK_FMAC_F16_e64:
- case AMDGPU::V_PK_LSHLREV_B16:
- case AMDGPU::V_PK_LSHRREV_B16:
- case AMDGPU::V_PK_MAD_I16:
- case AMDGPU::V_PK_MAD_U16:
- case AMDGPU::V_PK_MAX_F16:
- case AMDGPU::V_PK_MAX_I16:
- case AMDGPU::V_PK_MAX_U16:
- case AMDGPU::V_PK_MIN_F16:
- case AMDGPU::V_PK_MIN_I16:
- case AMDGPU::V_PK_MIN_U16:
- case AMDGPU::V_PK_MOV_B32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_MUL_LO_U16:
- case AMDGPU::V_PK_SUB_I16:
- case AMDGPU::V_PK_SUB_U16:
- case AMDGPU::V_QSAD_PK_U16_U8_e64:
- return true;
- default:
- return false;
- }
+ case AMDGPU::V_CVT_PK_BF8_F32_e64:
+ case AMDGPU::V_CVT_PK_FP8_F32_e64:
+ case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+ case AMDGPU::V_MQSAD_U32_U8_e64:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_I16:
+ case AMDGPU::V_PK_ADD_U16:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_FMA_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMAC_F16_e32:
+ case AMDGPU::V_PK_FMAC_F16_e64:
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_MAD_I16:
+ case AMDGPU::V_PK_MAD_U16:
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_PK_MAX_I16:
+ case AMDGPU::V_PK_MAX_U16:
+ case AMDGPU::V_PK_MIN_F16:
+ case AMDGPU::V_PK_MIN_I16:
+ case AMDGPU::V_PK_MIN_U16:
+ case AMDGPU::V_PK_MOV_B32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_LO_U16:
+ case AMDGPU::V_PK_SUB_I16:
+ case AMDGPU::V_PK_SUB_U16:
+ case AMDGPU::V_QSAD_PK_U16_U8_e64:
+ return true;
+ default:
+ return false;
+ }
}
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index d0b49ffc19600..2f512eac41911 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1201,7 +1201,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
}
bool isNeverCoissue(MachineInstr &MI) const;
-
+
/// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isLegalAV64PseudoImm(uint64_t Imm) const;
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index cad096e0d2fcc..e3d9ac7b031d2 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -582,9 +582,8 @@ void SIPreEmitPeephole::createListOfPackedInstr(
// instructions have latency of 1.
// TODO: improve latency handling of possible inserted instructions
TotalCyclesBetweenCandidates += 2;
- // if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) {
- InstrsToUnpack.insert(&Instr);
- // }
+ if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
+ InstrsToUnpack.insert(&Instr);
}
}
return;
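The new MIR test can be exercised in isolation with a command mirroring its RUN line (path as in the diffstat above):

    llc -march=amdgcn -mcpu=gfx950 -run-pass=si-pre-emit-peephole \
      -o - llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir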
More information about the llvm-commits
mailing list