[llvm] [AMDGPU]: Unpack packed instructions overlapped by MFMAs post-RA scheduling (PR #157968)
Akash Dutta via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 18 14:13:10 PDT 2025
https://github.com/akadutta updated https://github.com/llvm/llvm-project/pull/157968
>From c15104f964cb8c103e8e4aede15e561a853e0947 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Wed, 10 Sep 2025 18:04:47 -0500
Subject: [PATCH 01/16] unpack packed instructions overlapped by MFMAs post-RA
scheduling
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 60 ++++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 295 +++++++++++++++++-
...ck-non-coissue-insts-post-ra-scheduler.mir | 151 +++++++++
4 files changed, 503 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 398c99b3bd127..8fce521da157e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6359,6 +6359,66 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return isImmOperandLegal(MI, OpIdx, *MO);
}
+bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
+ bool IsGFX950Only = ST.hasGFX950Insts();
+ bool IsGFX940Only = ST.hasGFX940Insts();
+
+ if (!IsGFX950Only && !IsGFX940Only)
+ return false;
+
+ if (!isVALU(MI))
+ return false;
+
+ // V_COS, V_EXP, V_RCP, etc.
+ if (isTRANS(MI))
+ return true;
+
+ // DOT2, DOT2C, DOT4, etc.
+ if (isDOT(MI))
+ return true;
+
+ // MFMA, SMFMA
+ if (isMFMA(MI))
+ return true;
+
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_CVT_PK_BF8_F32_e64:
+ case AMDGPU::V_CVT_PK_FP8_F32_e64:
+ case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+ case AMDGPU::V_MQSAD_U32_U8_e64:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_I16:
+ case AMDGPU::V_PK_ADD_U16:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_FMA_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMAC_F16_e32:
+ case AMDGPU::V_PK_FMAC_F16_e64:
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_MAD_I16:
+ case AMDGPU::V_PK_MAD_U16:
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_PK_MAX_I16:
+ case AMDGPU::V_PK_MAX_U16:
+ case AMDGPU::V_PK_MIN_F16:
+ case AMDGPU::V_PK_MIN_I16:
+ case AMDGPU::V_PK_MIN_U16:
+ case AMDGPU::V_PK_MOV_B32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_LO_U16:
+ case AMDGPU::V_PK_SUB_I16:
+ case AMDGPU::V_PK_SUB_U16:
+ case AMDGPU::V_QSAD_PK_U16_U8_e64:
+ return true;
+ default:
+ return false;
+ }
+}
+
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MachineInstr &MI) const {
unsigned Opc = MI.getOpcode();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index f7dde2b90b68e..d0b49ffc19600 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1200,6 +1200,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return isImmOperandLegal(MI.getDesc(), OpNo, MO);
}
+ bool isNeverCoissue(MachineInstr &MI) const;
+
/// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isLegalAV64PseudoImm(uint64_t Imm) const;
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 2c2ceedf8a2f6..cad096e0d2fcc 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -9,11 +9,19 @@
/// \file
/// This pass performs the peephole optimizations before code emission.
///
+/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32/F16,
+/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs such that they can be
+/// co-issued. This helps with overlapping MFMA and certain vector instructions
+/// in machine schedules and is expected to improve performance. Only those
+/// packed instructions are unpacked that are overlapped by the MFMA latency.
+/// Rest should remain untouched.
+/// TODO: Add support for F16 packed instructions
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"
@@ -28,6 +36,7 @@ class SIPreEmitPeephole {
private:
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
+ MachineRegisterInfo *MRI;
bool optimizeVccBranch(MachineInstr &MI) const;
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
@@ -39,6 +48,37 @@ class SIPreEmitPeephole {
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
+ // Check if the machine instruction being processed is a supported packed
+ // instruction
+ bool isUnpackingSupportedInstr(MachineInstr &MI) const;
+ // Creates a list of packed instructions following an MFMA that are suitable
+ // for unpacking.
+ void createListOfPackedInstr(MachineInstr &BeginMI,
+ SetVector<MachineInstr *> &InstrsToUnpack,
+ uint16_t NumMFMACycles);
+ // Identify register dependencies between those used by the MFMA
+ // instruction and the following packed instructions. Conservatively ensures
+ // that we do not incorrectly read/write registers.
+ bool hasReadWriteDependencies(const MachineInstr &PredMI,
+ const MachineInstr &SuccMI);
+ // Unpack F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and V_PK_FMA.
+ // Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for this
+ // transformation.
+ void processF32Unpacking(MachineInstr &I);
+ // Insert appropriate unpacked instructions into the BB
+ void insertUnpackedF32MI(MachineInstr &I, bool IsFMA);
+ // Select corresponding unpacked instruction from packed instruction as input
+ uint16_t mapToUnpackedOpcode(MachineInstr &I);
+ // Creates the unpacked instruction to be inserted. Adds source modifiers to
+ // the unpacked instructions based on the source modifiers in the packed
+ // instruction
+ MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
+ const DebugLoc &DL,
+ uint16_t UnpackedOpcode, bool IsHiBits,
+ bool IsFMA);
+ void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods,
+ unsigned NegModifier, unsigned OpSelModifier,
+ MachineOperand &SrcMO);
public:
bool run(MachineFunction &MF);
@@ -274,11 +314,9 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
return false;
if (IdxReg && I->modifiesRegister(IdxReg, TRI))
return false;
- if (llvm::any_of(I->operands(),
- [&MRI, this](const MachineOperand &MO) {
- return MO.isReg() &&
- TRI->isVectorRegister(MRI, MO.getReg());
- })) {
+ if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
+ return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
+ })) {
// The only exception allowed here is another indirect vector move
// with the same mode.
if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
@@ -417,6 +455,233 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
+bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_FMA_F32:
+ return true;
+ default:
+ return false;
+ }
+ llvm_unreachable("Fully covered switch");
+}
+
+bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI,
+ const MachineInstr &SuccMI) {
+ for (const MachineOperand &Pred_Ops : PredMI.operands()) {
+ if (!Pred_Ops.isReg() || !Pred_Ops.isDef())
+ continue;
+ Register Pred_Reg = Pred_Ops.getReg();
+ if (!Pred_Reg.isValid())
+ continue;
+ for (const MachineOperand &Succ_Ops : SuccMI.operands()) {
+ if (!Succ_Ops.isReg() || !Succ_Ops.isDef())
+ continue;
+ Register Succ_Reg = Succ_Ops.getReg();
+ if (!Succ_Reg.isValid())
+ continue;
+ if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
+ unsigned Opcode = I.getOpcode();
+ // Use 64 bit encoding to allow use of VOP3 instructions.
+ // VOP3 instructions allow VOP3P source modifiers to be translated to VOP3
+ // e32 instructions are VOP2 and don't allow source modifiers
+ switch (Opcode) {
+ case AMDGPU::V_PK_ADD_F32:
+ return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::V_PK_MUL_F32:
+ return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::V_PK_FMA_F32:
+ return AMDGPU::V_FMA_F32_e64;
+ default:
+ return std::numeric_limits<uint16_t>::max();
+ }
+ llvm_unreachable("Fully covered switch");
+}
+
+void SIPreEmitPeephole::addOperandandMods(MachineInstrBuilder NewMI,
+ unsigned Src_Mods,
+ unsigned NegModifier,
+ unsigned OpSelModifier,
+ MachineOperand &SrcMO) {
+ unsigned New_Src_Mods = 0;
+ const TargetRegisterInfo *RI = SrcMO.getParent()
+ ->getParent()
+ ->getParent()
+ ->getSubtarget()
+ .getRegisterInfo();
+ // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
+ // lane.
+ // NEG_HI shares the same bit position with ABS. But packed instructions do
+ // not support ABS. Therefore, NEG_HI must be translated to NEG source
+ // modifier for the higher 32 bits. Unpacked VOP3 instructions do support
+ // ABS, therefore we need to explicitly add the NEG modifier if present in
+ // the packed instruction
+ if (Src_Mods & NegModifier) {
+ New_Src_Mods |= SISrcMods::NEG;
+ }
+ // Src modifiers. Only negative modifiers are added if needed. Unpacked
+ // operations do not have op_sel, therefore it must be handled explicitly as
+ // done below. Unpacked operations support abs, but packed instructions do
+ // not. Thus, abs is not handled.
+ NewMI.addImm(New_Src_Mods);
+ if (SrcMO.isImm()) {
+ NewMI.addImm(SrcMO.getImm());
+ } else {
+ // If op_sel == 0, select register 0 of reg:sub0_sub1
+ Register UnpackedSrcReg = (Src_Mods & OpSelModifier)
+ ? RI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
+ : RI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
+ if (SrcMO.isReg() && SrcMO.isKill())
+ NewMI.addReg(UnpackedSrcReg, RegState::Kill);
+ else
+ NewMI.addReg(UnpackedSrcReg);
+ }
+}
+
+void SIPreEmitPeephole::createListOfPackedInstr(
+ MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
+ uint16_t NumMFMACycles) {
+ auto *BB = BeginMI.getParent();
+ auto E = BB->end();
+ int TotalCyclesBetweenCandidates = 0;
+ auto SchedModel = TII->getSchedModel();
+ for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
+ MachineInstr &Instr = *I;
+ const MCSchedClassDesc *InstrSchedClassDesc =
+ SchedModel.resolveSchedClass(&Instr);
+ TotalCyclesBetweenCandidates +=
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+
+ if (Instr.isMetaInstruction())
+ continue;
+ if (Instr.isTerminator())
+ return;
+ if (TotalCyclesBetweenCandidates > NumMFMACycles)
+ return;
+ if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
+ if (hasReadWriteDependencies(BeginMI, Instr))
+ continue;
+
+ // If it is a packed instruction, we should subtract it's latency from the
+ // overall latency calculation here, because the packed instruction will
+ // be removed and replaced by 2 unpacked instructions
+ TotalCyclesBetweenCandidates -=
+ SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+ // We're adding 2 to account for the extra latency added by unpacking into
+ // 2 instructions. At the time of writing, the considered unpacked
+ // instructions have latency of 1.
+ // TODO: improve latency handling of possible inserted instructions
+ TotalCyclesBetweenCandidates += 2;
+ // if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) {
+ InstrsToUnpack.insert(&Instr);
+ // }
+ }
+ }
+ return;
+}
+
+void SIPreEmitPeephole::processF32Unpacking(MachineInstr &I) {
+ if (SIInstrInfo::modifiesModeRegister(I) ||
+ I.modifiesRegister(AMDGPU::EXEC, TRI))
+ return;
+ bool IsFMA = (I.getOpcode() == AMDGPU::V_PK_FMA_F32) ? true : false;
+ insertUnpackedF32MI(I, IsFMA);
+ return;
+}
+
+void SIPreEmitPeephole::insertUnpackedF32MI(MachineInstr &I, bool IsFMA) {
+ MachineBasicBlock &MBB = *I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register DstReg = I.getOperand(0).getReg();
+
+ uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
+ return;
+
+ MachineInstrBuilder Op0L_Op1L = createUnpackedMI(
+ MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA);
+ if (I.getOperand(0).isUndef())
+ Op0L_Op1L->getOperand(0).setIsUndef();
+
+ MachineInstrBuilder Op0H_Op1H = createUnpackedMI(
+ MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA);
+
+ if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ }
+ if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
+ Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract);
+ Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract);
+ }
+ if (I.getOperand(0).getReg().isPhysical() && I.getOperand(0).isRenamable()) {
+ Op0L_Op1L.getInstr()->getOperand(0).setIsRenamable(true);
+ Op0H_Op1H.getInstr()->getOperand(0).setIsRenamable(true);
+ }
+
+ I.eraseFromParent();
+ return;
+}
+
+MachineInstrBuilder
+SIPreEmitPeephole::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
+ const DebugLoc &DL, uint16_t UnpackedOpcode,
+ bool IsHiBits, bool IsFMA) {
+ MachineOperand &DstMO = I.getOperand(0);
+ MachineOperand &SrcMO1 = I.getOperand(2);
+ MachineOperand &SrcMO2 = I.getOperand(4);
+ Register DstReg = DstMO.getReg();
+ const TargetRegisterInfo *RI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ Register UnpackedDstReg = IsHiBits ? RI->getSubReg(DstReg, AMDGPU::sub1)
+ : RI->getSubReg(DstReg, AMDGPU::sub0);
+
+ int ClampIdx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ int64_t ClampVal = I.getOperand(ClampIdx).getImm();
+ int Src0_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ int Src1_modifiers_Idx =
+ AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+
+ unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
+ unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
+ // Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
+ unsigned New_Src0_Mods = 0;
+ unsigned New_Src1_Mods = 0;
+
+ unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
+ unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
+
+ MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
+ NewMI.addDef(UnpackedDstReg); // vdst
+ addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1);
+ addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2);
+
+ if (IsFMA) {
+ MachineOperand &SrcMO3 = I.getOperand(6);
+ int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(
+ I.getOpcode(), AMDGPU::OpName::src2_modifiers);
+ unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
+ addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3);
+ }
+ NewMI.addImm(ClampVal); // clamp
+ // Packed instructions do not support output modifiers. safe to assign them 0
+ // for this use case
+ NewMI.addImm(0); // omod
+ return NewMI;
+}
+
PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
@@ -430,6 +695,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
bool Changed = false;
MF.RenumberBlocks();
@@ -461,7 +727,21 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
// and limit the distance to 20 instructions for compile time purposes.
// Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
// may be bundled with the instructions they modify.
+ //
+ // Unpack packed instructions overlapped by MFMAs. This allows the compiler
+ // to co-issue unpacked instructions with MFMA
+ uint16_t NumMFMACycles = 0;
+ auto SchedModel = TII->getSchedModel();
+ SetVector<MachineInstr *> InstrsToUnpack;
+
for (auto &MI : make_early_inc_range(MBB.instrs())) {
+ if (SIInstrInfo::isMFMA(MI)) {
+ const MCSchedClassDesc *SchedClassDesc =
+ SchedModel.resolveSchedClass(&MI);
+ NumMFMACycles =
+ SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles);
+ }
if (Count == Threshold)
SetGPRMI = nullptr;
else
@@ -481,6 +761,11 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
else
SetGPRMI = &MI;
}
+ if (!InstrsToUnpack.empty()) {
+ for (MachineInstr *MI : InstrsToUnpack) {
+ processF32Unpacking(*MI);
+ }
+ }
}
return Changed;
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
new file mode 100644
index 0000000000000..532344ea9cbd5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -0,0 +1,151 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -march=amdgcn -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: test_pk_mul_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_pk_mul_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ S_WAITCNT 49279
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ S_WAITCNT 49279
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_op_sel_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5'}
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_op_sel_selection_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ S_WAITCNT 49279
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ S_WAITCNT 49279
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_op_sel_hi_selection_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5'}
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_op_sel_hi_selection_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: S_WAITCNT 49279
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ S_WAITCNT 49279
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ S_WAITCNT 49279
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
>From fb05da7a164c5227f8cda6c7055ce879c98c7844 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 11 Sep 2025 09:32:08 -0500
Subject: [PATCH 02/16] format error fix and unpack candidate selection
condition change
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 68 ++++++++++----------
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +-
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 5 +-
3 files changed, 37 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8fce521da157e..2048f61e1486a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6383,40 +6383,40 @@ bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- case AMDGPU::V_CVT_PK_BF8_F32_e64:
- case AMDGPU::V_CVT_PK_FP8_F32_e64:
- case AMDGPU::V_MQSAD_PK_U16_U8_e64:
- case AMDGPU::V_MQSAD_U32_U8_e64:
- case AMDGPU::V_PK_ADD_F16:
- case AMDGPU::V_PK_ADD_F32:
- case AMDGPU::V_PK_ADD_I16:
- case AMDGPU::V_PK_ADD_U16:
- case AMDGPU::V_PK_ASHRREV_I16:
- case AMDGPU::V_PK_FMA_F16:
- case AMDGPU::V_PK_FMA_F32:
- case AMDGPU::V_PK_FMAC_F16_e32:
- case AMDGPU::V_PK_FMAC_F16_e64:
- case AMDGPU::V_PK_LSHLREV_B16:
- case AMDGPU::V_PK_LSHRREV_B16:
- case AMDGPU::V_PK_MAD_I16:
- case AMDGPU::V_PK_MAD_U16:
- case AMDGPU::V_PK_MAX_F16:
- case AMDGPU::V_PK_MAX_I16:
- case AMDGPU::V_PK_MAX_U16:
- case AMDGPU::V_PK_MIN_F16:
- case AMDGPU::V_PK_MIN_I16:
- case AMDGPU::V_PK_MIN_U16:
- case AMDGPU::V_PK_MOV_B32:
- case AMDGPU::V_PK_MUL_F16:
- case AMDGPU::V_PK_MUL_F32:
- case AMDGPU::V_PK_MUL_LO_U16:
- case AMDGPU::V_PK_SUB_I16:
- case AMDGPU::V_PK_SUB_U16:
- case AMDGPU::V_QSAD_PK_U16_U8_e64:
- return true;
- default:
- return false;
- }
+ case AMDGPU::V_CVT_PK_BF8_F32_e64:
+ case AMDGPU::V_CVT_PK_FP8_F32_e64:
+ case AMDGPU::V_MQSAD_PK_U16_U8_e64:
+ case AMDGPU::V_MQSAD_U32_U8_e64:
+ case AMDGPU::V_PK_ADD_F16:
+ case AMDGPU::V_PK_ADD_F32:
+ case AMDGPU::V_PK_ADD_I16:
+ case AMDGPU::V_PK_ADD_U16:
+ case AMDGPU::V_PK_ASHRREV_I16:
+ case AMDGPU::V_PK_FMA_F16:
+ case AMDGPU::V_PK_FMA_F32:
+ case AMDGPU::V_PK_FMAC_F16_e32:
+ case AMDGPU::V_PK_FMAC_F16_e64:
+ case AMDGPU::V_PK_LSHLREV_B16:
+ case AMDGPU::V_PK_LSHRREV_B16:
+ case AMDGPU::V_PK_MAD_I16:
+ case AMDGPU::V_PK_MAD_U16:
+ case AMDGPU::V_PK_MAX_F16:
+ case AMDGPU::V_PK_MAX_I16:
+ case AMDGPU::V_PK_MAX_U16:
+ case AMDGPU::V_PK_MIN_F16:
+ case AMDGPU::V_PK_MIN_I16:
+ case AMDGPU::V_PK_MIN_U16:
+ case AMDGPU::V_PK_MOV_B32:
+ case AMDGPU::V_PK_MUL_F16:
+ case AMDGPU::V_PK_MUL_F32:
+ case AMDGPU::V_PK_MUL_LO_U16:
+ case AMDGPU::V_PK_SUB_I16:
+ case AMDGPU::V_PK_SUB_U16:
+ case AMDGPU::V_QSAD_PK_U16_U8_e64:
+ return true;
+ default:
+ return false;
+ }
}
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index d0b49ffc19600..2f512eac41911 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1201,7 +1201,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
}
bool isNeverCoissue(MachineInstr &MI) const;
-
+
/// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isLegalAV64PseudoImm(uint64_t Imm) const;
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index cad096e0d2fcc..e3d9ac7b031d2 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -582,9 +582,8 @@ void SIPreEmitPeephole::createListOfPackedInstr(
// instructions have latency of 1.
// TODO: improve latency handling of possible inserted instructions
TotalCyclesBetweenCandidates += 2;
- // if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) {
- InstrsToUnpack.insert(&Instr);
- // }
+ if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1))
+ InstrsToUnpack.insert(&Instr);
}
}
return;
>From bbefcfb618ac571e7198b927f8dc1e91a3f37788 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 11 Sep 2025 10:34:29 -0500
Subject: [PATCH 03/16] code comments, variable and function re-naming, minor
code optimizations
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 96 ++++++++++----------
1 file changed, 48 insertions(+), 48 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index e3d9ac7b031d2..caa16bfd573ea 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -53,9 +53,10 @@ class SIPreEmitPeephole {
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
- void createListOfPackedInstr(MachineInstr &BeginMI,
- SetVector<MachineInstr *> &InstrsToUnpack,
- uint16_t NumMFMACycles);
+ void
+ selectSuitableInstrsForUnpacking(MachineInstr &BeginMI,
+ SetVector<MachineInstr *> &InstrsToUnpack,
+ uint16_t NumMFMACycles);
// Identify register dependencies between those used by the MFMA
// instruction and the following packed instructions. Conservatively ensures
// that we do not incorrectly read/write registers.
@@ -64,10 +65,10 @@ class SIPreEmitPeephole {
// Unpack F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and V_PK_FMA.
// Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for this
// transformation.
- void processF32Unpacking(MachineInstr &I);
+ void performF32Unpacking(MachineInstr &I);
// Insert appropriate unpacked instructions into the BB
void insertUnpackedF32MI(MachineInstr &I, bool IsFMA);
- // Select corresponding unpacked instruction from packed instruction as input
+ // Select corresponding unpacked instruction from packed instruction as I
uint16_t mapToUnpackedOpcode(MachineInstr &I);
// Creates the unpacked instruction to be inserted. Adds source modifiers to
// the unpacked instructions based on the source modifiers in the packed
@@ -76,7 +77,9 @@ class SIPreEmitPeephole {
const DebugLoc &DL,
uint16_t UnpackedOpcode, bool IsHiBits,
bool IsFMA);
- void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods,
+ // process operands/source modifiers from packed instructions and insert the
+ // appropriate source modifers and operands into the unpacked instructions
+ void addOperandAndMods(MachineInstrBuilder NewMI, unsigned Src_Mods,
unsigned NegModifier, unsigned OpSelModifier,
MachineOperand &SrcMO);
@@ -470,21 +473,20 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI,
const MachineInstr &SuccMI) {
- for (const MachineOperand &Pred_Ops : PredMI.operands()) {
- if (!Pred_Ops.isReg() || !Pred_Ops.isDef())
+ for (const MachineOperand &PredOps : PredMI.operands()) {
+ if (!PredOps.isReg() || !PredOps.isDef())
continue;
- Register Pred_Reg = Pred_Ops.getReg();
- if (!Pred_Reg.isValid())
+ Register PredReg = PredOps.getReg();
+ if (!PredReg.isValid())
continue;
- for (const MachineOperand &Succ_Ops : SuccMI.operands()) {
- if (!Succ_Ops.isReg() || !Succ_Ops.isDef())
+ for (const MachineOperand &SuccOps : SuccMI.operands()) {
+ if (!SuccOps.isReg() || !SuccOps.isDef())
continue;
- Register Succ_Reg = Succ_Ops.getReg();
- if (!Succ_Reg.isValid())
+ Register SuccReg = SuccOps.getReg();
+ if (!SuccReg.isValid())
continue;
- if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) {
+ if ((PredReg == SuccReg) || TRI->regsOverlap(PredReg, SuccReg))
return true;
- }
}
}
return false;
@@ -508,12 +510,12 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
llvm_unreachable("Fully covered switch");
}
-void SIPreEmitPeephole::addOperandandMods(MachineInstrBuilder NewMI,
+void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
unsigned Src_Mods,
unsigned NegModifier,
unsigned OpSelModifier,
MachineOperand &SrcMO) {
- unsigned New_Src_Mods = 0;
+ unsigned NewSrcMods = 0;
const TargetRegisterInfo *RI = SrcMO.getParent()
->getParent()
->getParent()
@@ -526,29 +528,28 @@ void SIPreEmitPeephole::addOperandandMods(MachineInstrBuilder NewMI,
// modifier for the higher 32 bits. Unpacked VOP3 instructions do support
// ABS, therefore we need to explicitly add the NEG modifier if present in
// the packed instruction
- if (Src_Mods & NegModifier) {
- New_Src_Mods |= SISrcMods::NEG;
- }
+ if (Src_Mods & NegModifier)
+ NewSrcMods |= SISrcMods::NEG;
// Src modifiers. Only negative modifiers are added if needed. Unpacked
// operations do not have op_sel, therefore it must be handled explicitly as
// done below. Unpacked operations support abs, but packed instructions do
// not. Thus, abs is not handled.
- NewMI.addImm(New_Src_Mods);
+ NewMI.addImm(NewSrcMods);
if (SrcMO.isImm()) {
NewMI.addImm(SrcMO.getImm());
- } else {
- // If op_sel == 0, select register 0 of reg:sub0_sub1
- Register UnpackedSrcReg = (Src_Mods & OpSelModifier)
- ? RI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
- : RI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
- if (SrcMO.isReg() && SrcMO.isKill())
- NewMI.addReg(UnpackedSrcReg, RegState::Kill);
- else
- NewMI.addReg(UnpackedSrcReg);
+ return;
}
+ // If op_sel == 0, select register 0 of reg:sub0_sub1
+ Register UnpackedSrcReg = (Src_Mods & OpSelModifier)
+ ? RI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
+ : RI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
+ if (SrcMO.isReg() && SrcMO.isKill())
+ NewMI.addReg(UnpackedSrcReg, RegState::Kill);
+ else
+ NewMI.addReg(UnpackedSrcReg);
}
-void SIPreEmitPeephole::createListOfPackedInstr(
+void SIPreEmitPeephole::selectSuitableInstrsForUnpacking(
MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles) {
auto *BB = BeginMI.getParent();
@@ -589,7 +590,7 @@ void SIPreEmitPeephole::createListOfPackedInstr(
return;
}
-void SIPreEmitPeephole::processF32Unpacking(MachineInstr &I) {
+void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
if (SIInstrInfo::modifiesModeRegister(I) ||
I.modifiesRegister(AMDGPU::EXEC, TRI))
return;
@@ -648,31 +649,31 @@ SIPreEmitPeephole::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
int ClampIdx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
int64_t ClampVal = I.getOperand(ClampIdx).getImm();
- int Src0_modifiers_Idx =
+ int Src0ModifiersIdx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
- int Src1_modifiers_Idx =
+ int Src1ModifiersIdx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
- unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
- unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
+ unsigned Src0Mods = I.getOperand(Src0ModifiersIdx).getImm();
+ unsigned Src1Mods = I.getOperand(Src1ModifiersIdx).getImm();
// Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
- unsigned New_Src0_Mods = 0;
- unsigned New_Src1_Mods = 0;
+ unsigned NewSrc0Mods = 0;
+ unsigned NewSrc1Mods = 0;
unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
- addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1);
- addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2);
+ addOperandAndMods(NewMI, Src0Mods, NegModifier, OpSelModifier, SrcMO1);
+ addOperandAndMods(NewMI, Src1Mods, NegModifier, OpSelModifier, SrcMO2);
if (IsFMA) {
MachineOperand &SrcMO3 = I.getOperand(6);
- int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(
+ int Src2ModifiersIdx = AMDGPU::getNamedOperandIdx(
I.getOpcode(), AMDGPU::OpName::src2_modifiers);
- unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
- addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3);
+ unsigned Src2Mods = I.getOperand(Src2ModifiersIdx).getImm();
+ addOperandAndMods(NewMI, Src2Mods, NegModifier, OpSelModifier, SrcMO3);
}
NewMI.addImm(ClampVal); // clamp
// Packed instructions do not support output modifiers. safe to assign them 0
@@ -739,7 +740,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
SchedModel.resolveSchedClass(&MI);
NumMFMACycles =
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
- createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles);
+ selectSuitableInstrsForUnpacking(MI, InstrsToUnpack, NumMFMACycles);
}
if (Count == Threshold)
SetGPRMI = nullptr;
@@ -761,9 +762,8 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
SetGPRMI = &MI;
}
if (!InstrsToUnpack.empty()) {
- for (MachineInstr *MI : InstrsToUnpack) {
- processF32Unpacking(*MI);
- }
+ for (MachineInstr *MI : InstrsToUnpack)
+ performF32Unpacking(*MI);
}
}
>From f71631d5a0b19c232a69c78ee744e28e5c4ef06e Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 11 Sep 2025 12:25:39 -0500
Subject: [PATCH 04/16] MIR tests for pk_add and pk_fma
---
...ck-non-coissue-insts-post-ra-scheduler.mir | 72 +++++++++++++++++++
1 file changed, 72 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index 532344ea9cbd5..3a6afc108ff7e 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -149,3 +149,75 @@ body: |
$vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
+
+...
+---
+name: test_pk_add_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
+ renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec
+ renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+ renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec
+ $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
+ early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+
+...
+---
+name: test_pk_fma_unpacking_f32
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
>From 6f5c6a529603fb227aee49940c628bca38516e56 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 12 Sep 2025 13:41:49 -0500
Subject: [PATCH 05/16] check if unpacking can introduce dependencies, improve
candidate selection, code cleanup
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 233 ++++++++++--------
...ck-non-coissue-insts-post-ra-scheduler.mir | 117 ++++++++-
2 files changed, 247 insertions(+), 103 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index caa16bfd573ea..86290fa8e01e1 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -36,7 +36,6 @@ class SIPreEmitPeephole {
private:
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
- MachineRegisterInfo *MRI;
bool optimizeVccBranch(MachineInstr &MI) const;
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
@@ -53,33 +52,36 @@ class SIPreEmitPeephole {
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
- void
- selectSuitableInstrsForUnpacking(MachineInstr &BeginMI,
- SetVector<MachineInstr *> &InstrsToUnpack,
- uint16_t NumMFMACycles);
+ void collectUnpackingCandidates(MachineInstr &BeginMI,
+ SetVector<MachineInstr *> &InstrsToUnpack,
+ uint16_t NumMFMACycles);
// Identify register dependencies between those used by the MFMA
// instruction and the following packed instructions. Conservatively ensures
// that we do not incorrectly read/write registers.
- bool hasReadWriteDependencies(const MachineInstr &PredMI,
- const MachineInstr &SuccMI);
- // Unpack F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and V_PK_FMA.
- // Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for this
- // transformation.
+ bool hasRWDependencies(const MachineInstr &PredMI,
+ const MachineInstr &SuccMI);
+ // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
+ // op_sel_hi:[0,0,0]
+ // ==>
+ // v_fma_f32 v0, v1, v3, v3
+ // v_fma_f32 v1, v0, v2, v2
+ // here, we have overwritten v0 before we use it. This function checks if
+ // unpacking can lead to such a situation
+ bool canUnpackingIntroduceDependencies(const MachineInstr &MI);
+ // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
+ // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
+ // this transformation.
void performF32Unpacking(MachineInstr &I);
- // Insert appropriate unpacked instructions into the BB
- void insertUnpackedF32MI(MachineInstr &I, bool IsFMA);
// Select corresponding unpacked instruction from packed instruction as I
uint16_t mapToUnpackedOpcode(MachineInstr &I);
// Creates the unpacked instruction to be inserted. Adds source modifiers to
// the unpacked instructions based on the source modifiers in the packed
// instruction
- MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
- const DebugLoc &DL,
- uint16_t UnpackedOpcode, bool IsHiBits,
- bool IsFMA);
+ MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
+ bool IsHiBits);
// process operands/source modifiers from packed instructions and insert the
// appropriate source modifers and operands into the unpacked instructions
- void addOperandAndMods(MachineInstrBuilder NewMI, unsigned Src_Mods,
+ void addOperandAndMods(MachineInstrBuilder NewMI, unsigned SrcMods,
unsigned NegModifier, unsigned OpSelModifier,
MachineOperand &SrcMO);
@@ -471,8 +473,8 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
llvm_unreachable("Fully covered switch");
}
-bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI,
- const MachineInstr &SuccMI) {
+bool SIPreEmitPeephole::hasRWDependencies(const MachineInstr &PredMI,
+ const MachineInstr &SuccMI) {
for (const MachineOperand &PredOps : PredMI.operands()) {
if (!PredOps.isReg() || !PredOps.isDef())
continue;
@@ -480,7 +482,7 @@ bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI,
if (!PredReg.isValid())
continue;
for (const MachineOperand &SuccOps : SuccMI.operands()) {
- if (!SuccOps.isReg() || !SuccOps.isDef())
+ if (!SuccOps.isReg())
continue;
Register SuccReg = SuccOps.getReg();
if (!SuccReg.isValid())
@@ -492,11 +494,54 @@ bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI,
return false;
}
+bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
+ const MachineInstr &MI) {
+ unsigned OpCode = MI.getOpcode();
+ bool IsFMA = (OpCode == AMDGPU::V_PK_FMA_F32) ? true : false;
+ MachineOperand DstMO = MI.getOperand(0);
+ Register DstReg = DstMO.getReg();
+ Register SrcReg0 = MI.getOperand(2).getReg();
+ Register SrcReg1 = MI.getOperand(4).getReg();
+
+ Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
+ int Src0ModifiersIdx =
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers);
+ int Src1ModifiersIdx =
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src1_modifiers);
+ unsigned Src0Mods = MI.getOperand(Src0ModifiersIdx).getImm();
+ unsigned Src1Mods = MI.getOperand(Src1ModifiersIdx).getImm();
+
+ Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
+ ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
+ : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
+ Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
+ ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
+ : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
+ if (UnpackedDstReg == HiSrc0Reg ||
+ TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg) ||
+ UnpackedDstReg == HiSrc1Reg ||
+ TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
+ return true;
+ if (IsFMA) {
+ int Src2ModifiersIdx =
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers);
+ unsigned Src2Mods = MI.getOperand(Src2ModifiersIdx).getImm();
+ Register SrcReg2 = MI.getOperand(6).getReg();
+ Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
+ ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
+ : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
+ if (UnpackedDstReg == HiSrc2Reg ||
+ TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
+ return true;
+ }
+ return false;
+}
+
uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
unsigned Opcode = I.getOpcode();
// Use 64 bit encoding to allow use of VOP3 instructions.
- // VOP3 instructions allow VOP3P source modifiers to be translated to VOP3
- // e32 instructions are VOP2 and don't allow source modifiers
+ // VOP3 e64 instructions allow source modifiers
+ // e32 instructions don't allow source modifiers
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
return AMDGPU::V_ADD_F32_e64;
@@ -511,16 +556,11 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
}
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
- unsigned Src_Mods,
+ unsigned SrcMods,
unsigned NegModifier,
unsigned OpSelModifier,
MachineOperand &SrcMO) {
unsigned NewSrcMods = 0;
- const TargetRegisterInfo *RI = SrcMO.getParent()
- ->getParent()
- ->getParent()
- ->getSubtarget()
- .getRegisterInfo();
// If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
// lane.
// NEG_HI shares the same bit position with ABS. But packed instructions do
@@ -528,7 +568,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
// modifier for the higher 32 bits. Unpacked VOP3 instructions do support
// ABS, therefore we need to explicitly add the NEG modifier if present in
// the packed instruction
- if (Src_Mods & NegModifier)
+ if (SrcMods & NegModifier)
NewSrcMods |= SISrcMods::NEG;
// Src modifiers. Only negative modifiers are added if needed. Unpacked
// operations do not have op_sel, therefore it must be handled explicitly as
@@ -540,16 +580,16 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
return;
}
// If op_sel == 0, select register 0 of reg:sub0_sub1
- Register UnpackedSrcReg = (Src_Mods & OpSelModifier)
- ? RI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
- : RI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
+ Register UnpackedSrcReg = (SrcMods & OpSelModifier)
+ ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
+ : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
if (SrcMO.isReg() && SrcMO.isKill())
NewMI.addReg(UnpackedSrcReg, RegState::Kill);
else
NewMI.addReg(UnpackedSrcReg);
}
-void SIPreEmitPeephole::selectSuitableInstrsForUnpacking(
+void SIPreEmitPeephole::collectUnpackingCandidates(
MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles) {
auto *BB = BeginMI.getParent();
@@ -558,21 +598,25 @@ void SIPreEmitPeephole::selectSuitableInstrsForUnpacking(
auto SchedModel = TII->getSchedModel();
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
+ if (Instr.isMetaInstruction())
+ continue;
+ if (Instr.isTerminator())
+ return;
+ if (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr))
+ return;
const MCSchedClassDesc *InstrSchedClassDesc =
SchedModel.resolveSchedClass(&Instr);
TotalCyclesBetweenCandidates +=
SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
- if (Instr.isMetaInstruction())
- continue;
- if (Instr.isTerminator())
- return;
if (TotalCyclesBetweenCandidates > NumMFMACycles)
return;
- if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
- if (hasReadWriteDependencies(BeginMI, Instr))
+ if (isUnpackingSupportedInstr(Instr)) {
+ assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued.");
+ if (hasRWDependencies(BeginMI, Instr))
+ return;
+ if (canUnpackingIntroduceDependencies(Instr))
continue;
-
// If it is a packed instruction, we should subtract it's latency from the
// overall latency calculation here, because the packed instruction will
// be removed and replaced by 2 unpacked instructions
@@ -591,74 +635,65 @@ void SIPreEmitPeephole::selectSuitableInstrsForUnpacking(
}
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
- if (SIInstrInfo::modifiesModeRegister(I) ||
- I.modifiesRegister(AMDGPU::EXEC, TRI))
- return;
- bool IsFMA = (I.getOpcode() == AMDGPU::V_PK_FMA_F32) ? true : false;
- insertUnpackedF32MI(I, IsFMA);
- return;
-}
-
-void SIPreEmitPeephole::insertUnpackedF32MI(MachineInstr &I, bool IsFMA) {
MachineBasicBlock &MBB = *I.getParent();
- const DebugLoc &DL = I.getDebugLoc();
- Register DstReg = I.getOperand(0).getReg();
+ MachineOperand DstOp = I.getOperand(0);
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
return;
- MachineInstrBuilder Op0L_Op1L = createUnpackedMI(
- MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA);
- if (I.getOperand(0).isUndef())
- Op0L_Op1L->getOperand(0).setIsUndef();
+ MachineInstrBuilder Op0LOp1L =
+ createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
+ MachineOperand LoDstOp = Op0LOp1L->getOperand(0);
+
+ if (DstOp.isUndef())
+ LoDstOp.setIsUndef();
- MachineInstrBuilder Op0H_Op1H = createUnpackedMI(
- MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA);
+ MachineInstrBuilder Op0HOp1H =
+ createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
+ MachineOperand HiDstOp = Op0HOp1H->getOperand(0);
if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
- Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
- Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0LOp1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
+ Op0HOp1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
}
if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
- Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract);
- Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract);
+ Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract);
+ Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract);
}
if (I.getOperand(0).getReg().isPhysical() && I.getOperand(0).isRenamable()) {
- Op0L_Op1L.getInstr()->getOperand(0).setIsRenamable(true);
- Op0H_Op1H.getInstr()->getOperand(0).setIsRenamable(true);
+ LoDstOp.setIsRenamable(true);
+ HiDstOp.setIsRenamable(true);
}
I.eraseFromParent();
return;
}
-MachineInstrBuilder
-SIPreEmitPeephole::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
- const DebugLoc &DL, uint16_t UnpackedOpcode,
- bool IsHiBits, bool IsFMA) {
+MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
+ uint16_t UnpackedOpcode,
+ bool IsHiBits) {
+ MachineBasicBlock &MBB = *I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
Register DstReg = DstMO.getReg();
- const TargetRegisterInfo *RI =
- MBB.getParent()->getSubtarget().getRegisterInfo();
- Register UnpackedDstReg = IsHiBits ? RI->getSubReg(DstReg, AMDGPU::sub1)
- : RI->getSubReg(DstReg, AMDGPU::sub0);
+ unsigned OpCode = I.getOpcode();
+ Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
+ : TRI->getSubReg(DstReg, AMDGPU::sub0);
- int ClampIdx =
- AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
+ bool IsFMA = (OpCode == AMDGPU::V_PK_FMA_F32) ? true : false;
+ int ClampIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::clamp);
int64_t ClampVal = I.getOperand(ClampIdx).getImm();
int Src0ModifiersIdx =
- AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers);
int Src1ModifiersIdx =
- AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src1_modifiers);
unsigned Src0Mods = I.getOperand(Src0ModifiersIdx).getImm();
unsigned Src1Mods = I.getOperand(Src1ModifiersIdx).getImm();
// Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
- unsigned NewSrc0Mods = 0;
- unsigned NewSrc1Mods = 0;
unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
@@ -670,8 +705,8 @@ SIPreEmitPeephole::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
if (IsFMA) {
MachineOperand &SrcMO3 = I.getOperand(6);
- int Src2ModifiersIdx = AMDGPU::getNamedOperandIdx(
- I.getOpcode(), AMDGPU::OpName::src2_modifiers);
+ int Src2ModifiersIdx =
+ AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers);
unsigned Src2Mods = I.getOperand(Src2ModifiersIdx).getImm();
addOperandAndMods(NewMI, Src2Mods, NegModifier, OpSelModifier, SrcMO3);
}
@@ -695,12 +730,33 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
bool Changed = false;
MF.RenumberBlocks();
for (MachineBasicBlock &MBB : MF) {
+ // Unpack packed instructions overlapped by MFMAs. This allows the compiler
+ // to co-issue unpacked instructions with MFMA
+ uint16_t NumMFMACycles = 0;
+ auto SchedModel = TII->getSchedModel();
+ SetVector<MachineInstr *> InstrsToUnpack;
+ for (auto &MI : make_early_inc_range(MBB.instrs())) {
+ if (SIInstrInfo::isMFMA(MI)) {
+ const MCSchedClassDesc *SchedClassDesc =
+ SchedModel.resolveSchedClass(&MI);
+ NumMFMACycles =
+ SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
+ }
+ }
+ if (!InstrsToUnpack.empty()) {
+ for (MachineInstr *MI : InstrsToUnpack) {
+ if (!SIInstrInfo::modifiesModeRegister(*MI) &&
+ !MI->modifiesRegister(AMDGPU::EXEC, TRI))
+ performF32Unpacking(*MI);
+ }
+ }
+
MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
// Check first terminator for branches to optimize
if (TermI != MBB.end()) {
@@ -728,20 +784,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
// Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
// may be bundled with the instructions they modify.
//
- // Unpack packed instructions overlapped by MFMAs. This allows the compiler
- // to co-issue unpacked instructions with MFMA
- uint16_t NumMFMACycles = 0;
- auto SchedModel = TII->getSchedModel();
- SetVector<MachineInstr *> InstrsToUnpack;
-
for (auto &MI : make_early_inc_range(MBB.instrs())) {
- if (SIInstrInfo::isMFMA(MI)) {
- const MCSchedClassDesc *SchedClassDesc =
- SchedModel.resolveSchedClass(&MI);
- NumMFMACycles =
- SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
- selectSuitableInstrsForUnpacking(MI, InstrsToUnpack, NumMFMACycles);
- }
if (Count == Threshold)
SetGPRMI = nullptr;
else
@@ -761,10 +804,6 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
else
SetGPRMI = &MI;
}
- if (!InstrsToUnpack.empty()) {
- for (MachineInstr *MI : InstrsToUnpack)
- performF32Unpacking(*MI);
- }
}
return Changed;
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index 3a6afc108ff7e..9e520b54b3b5d 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -29,8 +29,8 @@ body: |
; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
@@ -79,8 +79,8 @@ body: |
; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
@@ -129,8 +129,8 @@ body: |
; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
@@ -161,6 +161,44 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_pk_add_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
+ ; GCN-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec
+ ; GCN-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+ ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr2 = nofpexcept V_ADD_F32_e64 0, killed $sgpr2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr3 = nofpexcept V_ADD_F32_e64 0, killed $sgpr3, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6 = nofpexcept V_ADD_F32_e64 0, killed $sgpr6, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr7 = nofpexcept V_ADD_F32_e64 0, killed $sgpr7, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = nofpexcept V_ADD_F32_e64 0, killed $sgpr4, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr5 = nofpexcept V_ADD_F32_e64 0, killed $sgpr5, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
@@ -206,6 +244,25 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_pk_fma_unpacking_f32
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -221,3 +278,51 @@ body: |
$vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
+
+...
+---
+name: test_unpacking_does_not_introduce_rw_dependency
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_unpacking_does_not_introduce_rw_dependency
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
>From ea85eb788535ce08ac4940f1d59e6619d54cdff0 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 12 Sep 2025 13:47:37 -0500
Subject: [PATCH 06/16] update MIR test cmd
---
.../AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index 9e520b54b3b5d..d45658fb415ea 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -march=amdgcn -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s
---
name: test_pk_mul_unpacking_f32
>From 4995f00271cd1e1fb5233102e271b9f3bf4d0a90 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 12 Sep 2025 17:34:31 -0500
Subject: [PATCH 07/16] remove use of hard coded operand idx
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 32 +++++++++-----------
1 file changed, 15 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 86290fa8e01e1..5c353481e6134 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -83,7 +83,7 @@ class SIPreEmitPeephole {
// appropriate source modifers and operands into the unpacked instructions
void addOperandAndMods(MachineInstrBuilder NewMI, unsigned SrcMods,
unsigned NegModifier, unsigned OpSelModifier,
- MachineOperand &SrcMO);
+ const MachineOperand &SrcMO);
public:
bool run(MachineFunction &MF);
@@ -497,11 +497,11 @@ bool SIPreEmitPeephole::hasRWDependencies(const MachineInstr &PredMI,
bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
const MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
- bool IsFMA = (OpCode == AMDGPU::V_PK_FMA_F32) ? true : false;
+ bool IsFMA = OpCode == AMDGPU::V_PK_FMA_F32;
MachineOperand DstMO = MI.getOperand(0);
Register DstReg = DstMO.getReg();
- Register SrcReg0 = MI.getOperand(2).getReg();
- Register SrcReg1 = MI.getOperand(4).getReg();
+ Register SrcReg0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
+ Register SrcReg1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
int Src0ModifiersIdx =
@@ -522,7 +522,7 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
UnpackedDstReg == HiSrc1Reg ||
TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
return true;
- if (IsFMA) {
+ if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
int Src2ModifiersIdx =
AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers);
unsigned Src2Mods = MI.getOperand(Src2ModifiersIdx).getImm();
@@ -559,7 +559,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
unsigned SrcMods,
unsigned NegModifier,
unsigned OpSelModifier,
- MachineOperand &SrcMO) {
+ const MachineOperand &SrcMO) {
unsigned NewSrcMods = 0;
// If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
// lane.
@@ -616,7 +616,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (hasRWDependencies(BeginMI, Instr))
return;
if (canUnpackingIntroduceDependencies(Instr))
- continue;
+ return;
// If it is a packed instruction, we should subtract it's latency from the
// overall latency calculation here, because the packed instruction will
// be removed and replaced by 2 unpacked instructions
@@ -635,7 +635,6 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
}
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
- MachineBasicBlock &MBB = *I.getParent();
MachineOperand DstOp = I.getOperand(0);
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
@@ -661,7 +660,7 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract);
Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract);
}
- if (I.getOperand(0).getReg().isPhysical() && I.getOperand(0).isRenamable()) {
+ if (DstOp.getReg().isPhysical() && DstOp.isRenamable()) {
LoDstOp.setIsRenamable(true);
HiDstOp.setIsRenamable(true);
}
@@ -676,14 +675,13 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
MachineOperand &DstMO = I.getOperand(0);
- MachineOperand &SrcMO1 = I.getOperand(2);
- MachineOperand &SrcMO2 = I.getOperand(4);
+ const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+ const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
Register DstReg = DstMO.getReg();
unsigned OpCode = I.getOpcode();
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
: TRI->getSubReg(DstReg, AMDGPU::sub0);
- bool IsFMA = (OpCode == AMDGPU::V_PK_FMA_F32) ? true : false;
int ClampIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::clamp);
int64_t ClampVal = I.getOperand(ClampIdx).getImm();
int Src0ModifiersIdx =
@@ -700,15 +698,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
- addOperandAndMods(NewMI, Src0Mods, NegModifier, OpSelModifier, SrcMO1);
- addOperandAndMods(NewMI, Src1Mods, NegModifier, OpSelModifier, SrcMO2);
+ addOperandAndMods(NewMI, Src0Mods, NegModifier, OpSelModifier, *SrcMO1);
+ addOperandAndMods(NewMI, Src1Mods, NegModifier, OpSelModifier, *SrcMO2);
- if (IsFMA) {
- MachineOperand &SrcMO3 = I.getOperand(6);
+ if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
+ const MachineOperand *SrcMO3 = TII->getNamedOperand(I, AMDGPU::OpName::src2);
int Src2ModifiersIdx =
AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers);
unsigned Src2Mods = I.getOperand(Src2ModifiersIdx).getImm();
- addOperandAndMods(NewMI, Src2Mods, NegModifier, OpSelModifier, SrcMO3);
+ addOperandAndMods(NewMI, Src2Mods, NegModifier, OpSelModifier, *SrcMO3);
}
NewMI.addImm(ClampVal); // clamp
// Packed instructions do not support output modifiers. safe to assign them 0
>From 05d0c75e120ce172775950d71da86d272ebd748d Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Fri, 12 Sep 2025 17:41:16 -0500
Subject: [PATCH 08/16] update test
---
.../AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index d45658fb415ea..0c1e063255cdf 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -307,8 +307,7 @@ body: |
; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
>From 88e3c6d639a066e1cbfeea5fdce6873b73d3683a Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 16 Sep 2025 15:12:34 -0500
Subject: [PATCH 09/16] direct and transitive dependency on MFMA def, reg kill
logic, new tests
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 205 ++++++++++--------
...ck-non-coissue-insts-post-ra-scheduler.mir | 170 ++++++++++++++-
2 files changed, 275 insertions(+), 100 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 5c353481e6134..d4db028dc1758 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -55,11 +55,6 @@ class SIPreEmitPeephole {
void collectUnpackingCandidates(MachineInstr &BeginMI,
SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles);
- // Identify register dependencies between those used by the MFMA
- // instruction and the following packed instructions. Conservatively ensures
- // that we do not incorrectly read/write registers.
- bool hasRWDependencies(const MachineInstr &PredMI,
- const MachineInstr &SuccMI);
// v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
// op_sel_hi:[0,0,0]
// ==>
@@ -82,8 +77,7 @@ class SIPreEmitPeephole {
// process operands/source modifiers from packed instructions and insert the
// appropriate source modifers and operands into the unpacked instructions
void addOperandAndMods(MachineInstrBuilder NewMI, unsigned SrcMods,
- unsigned NegModifier, unsigned OpSelModifier,
- const MachineOperand &SrcMO);
+ bool IsHiBits, const MachineOperand &SrcMO);
public:
bool run(MachineFunction &MF);
@@ -460,6 +454,8 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return true;
}
+// If support is extended to new operations, add tests in
+// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
@@ -473,36 +469,10 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
llvm_unreachable("Fully covered switch");
}
-bool SIPreEmitPeephole::hasRWDependencies(const MachineInstr &PredMI,
- const MachineInstr &SuccMI) {
- for (const MachineOperand &PredOps : PredMI.operands()) {
- if (!PredOps.isReg() || !PredOps.isDef())
- continue;
- Register PredReg = PredOps.getReg();
- if (!PredReg.isValid())
- continue;
- for (const MachineOperand &SuccOps : SuccMI.operands()) {
- if (!SuccOps.isReg())
- continue;
- Register SuccReg = SuccOps.getReg();
- if (!SuccReg.isValid())
- continue;
- if ((PredReg == SuccReg) || TRI->regsOverlap(PredReg, SuccReg))
- return true;
- }
- }
- return false;
-}
-
bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
const MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
- bool IsFMA = OpCode == AMDGPU::V_PK_FMA_F32;
- MachineOperand DstMO = MI.getOperand(0);
- Register DstReg = DstMO.getReg();
- Register SrcReg0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
- Register SrcReg1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
-
+ Register DstReg = MI.getOperand(0).getReg();
Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
int Src0ModifiersIdx =
AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers);
@@ -511,28 +481,40 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
unsigned Src0Mods = MI.getOperand(Src0ModifiersIdx).getImm();
unsigned Src1Mods = MI.getOperand(Src1ModifiersIdx).getImm();
- Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
- ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
- : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
- Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
- ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
- : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
- if (UnpackedDstReg == HiSrc0Reg ||
- TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg) ||
- UnpackedDstReg == HiSrc1Reg ||
- TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
- return true;
+ if (TII->getNamedOperand(MI, AMDGPU::OpName::src0)->isReg()) {
+ Register SrcReg0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
+ Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
+ ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
+ : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
+ if (UnpackedDstReg == HiSrc0Reg ||
+ TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
+ return true;
+ }
+
+ if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isReg()) {
+ Register SrcReg1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
+ Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
+ ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
+ : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
+ if (UnpackedDstReg == HiSrc1Reg ||
+ TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
+ return true;
+ }
+
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
int Src2ModifiersIdx =
AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers);
unsigned Src2Mods = MI.getOperand(Src2ModifiersIdx).getImm();
- Register SrcReg2 = MI.getOperand(6).getReg();
- Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
- ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
- : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
- if (UnpackedDstReg == HiSrc2Reg ||
- TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
- return true;
+ if (TII->getNamedOperand(MI, AMDGPU::OpName::src2)->isReg()) {
+ Register SrcReg2 =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
+ Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
+ ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
+ : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
+ if (UnpackedDstReg == HiSrc2Reg ||
+ TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
+ return true;
+ }
}
return false;
}
@@ -556,11 +538,13 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
}
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
- unsigned SrcMods,
- unsigned NegModifier,
- unsigned OpSelModifier,
+ unsigned SrcMods, bool IsHiBits,
const MachineOperand &SrcMO) {
unsigned NewSrcMods = 0;
+ unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
+ unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
+ // Packed instructions (VOP3P) do not support ABS. Hence, no checks are done
+ // for ABS modifiers.
// If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
// lane.
// NEG_HI shares the same bit position with ABS. But packed instructions do
@@ -583,10 +567,30 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
Register UnpackedSrcReg = (SrcMods & OpSelModifier)
? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
: TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
- if (SrcMO.isReg() && SrcMO.isKill())
- NewMI.addReg(UnpackedSrcReg, RegState::Kill);
- else
- NewMI.addReg(UnpackedSrcReg);
+
+ MachineOperand UnpackedSrcMO =
+ MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
+ if (SrcMO.isKill()) {
+ // For each unpacked instruction, mark its source registers as killed if the
+ // corresponding source register in the original packed instruction was
+ // marked as killed.
+ //
+ // Exception:
+ // If the op_sel and op_sel_hi modifiers require both unpacked instructions
+ // to use the same register (e.g., due to overlapping access to low/high
+ // bits of the same packed register), then only the *second* (latter)
+ // instruction should mark the register as killed. This is because the
+ // second instruction handles the higher bits and is effectively the last
+ // user of the full register pair.
+
+ bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
+ bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
+ bool KillState = true;
+ if ((OpSel == OpSelHi) && !IsHiBits)
+ KillState = false;
+ UnpackedSrcMO.setIsKill(KillState);
+ }
+ NewMI.add(UnpackedSrcMO);
}
void SIPreEmitPeephole::collectUnpackingCandidates(
@@ -596,6 +600,8 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
auto E = BB->end();
int TotalCyclesBetweenCandidates = 0;
auto SchedModel = TII->getSchedModel();
+ SetVector<Register> RegsOverlappedByMFMADef;
+ RegsOverlappedByMFMADef.insert(BeginMI.getOperand(0).getReg());
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
if (Instr.isMetaInstruction())
@@ -611,10 +617,25 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (TotalCyclesBetweenCandidates > NumMFMACycles)
return;
+ // Identify register dependencies between those used by the MFMA
+ // instruction and the following packed instructions. Also checks for
+ // transitive dependencies between the MFMA def and candidate instruction
+ // def and uses. Conservatively ensures that we do not incorrectly
+ // read/write registers.
+ for (const MachineOperand &InstrMO : Instr.operands()) {
+ if (InstrMO.isReg() && !InstrMO.isDef()) {
+ for (unsigned i = 0; i < RegsOverlappedByMFMADef.size(); ++i) {
+ if (TRI->regsOverlap(RegsOverlappedByMFMADef[i], InstrMO.getReg())) {
+ if (isUnpackingSupportedInstr(Instr))
+ return;
+ RegsOverlappedByMFMADef.insert(Instr.getOperand(0).getReg());
+ break;
+ }
+ }
+ }
+ }
if (isUnpackingSupportedInstr(Instr)) {
assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued.");
- if (hasRWDependencies(BeginMI, Instr))
- return;
if (canUnpackingIntroduceDependencies(Instr))
return;
// If it is a packed instruction, we should subtract it's latency from the
@@ -674,10 +695,9 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
bool IsHiBits) {
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- MachineOperand &DstMO = I.getOperand(0);
const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
- Register DstReg = DstMO.getReg();
+ Register DstReg = I.getOperand(0).getReg();
unsigned OpCode = I.getOpcode();
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
: TRI->getSubReg(DstReg, AMDGPU::sub0);
@@ -691,22 +711,19 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
unsigned Src0Mods = I.getOperand(Src0ModifiersIdx).getImm();
unsigned Src1Mods = I.getOperand(Src1ModifiersIdx).getImm();
- // Packed instructions (VOP3P) do not support abs. It is okay to ignore them.
-
- unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
- unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
- addOperandAndMods(NewMI, Src0Mods, NegModifier, OpSelModifier, *SrcMO1);
- addOperandAndMods(NewMI, Src1Mods, NegModifier, OpSelModifier, *SrcMO2);
+ addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
+ addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
- const MachineOperand *SrcMO3 = TII->getNamedOperand(I, AMDGPU::OpName::src2);
+ const MachineOperand *SrcMO3 =
+ TII->getNamedOperand(I, AMDGPU::OpName::src2);
int Src2ModifiersIdx =
AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers);
unsigned Src2Mods = I.getOperand(Src2ModifiersIdx).getImm();
- addOperandAndMods(NewMI, Src2Mods, NegModifier, OpSelModifier, *SrcMO3);
+ addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
}
NewMI.addImm(ClampVal); // clamp
// Packed instructions do not support output modifiers. safe to assign them 0
@@ -733,28 +750,6 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
MF.RenumberBlocks();
for (MachineBasicBlock &MBB : MF) {
- // Unpack packed instructions overlapped by MFMAs. This allows the compiler
- // to co-issue unpacked instructions with MFMA
- uint16_t NumMFMACycles = 0;
- auto SchedModel = TII->getSchedModel();
- SetVector<MachineInstr *> InstrsToUnpack;
- for (auto &MI : make_early_inc_range(MBB.instrs())) {
- if (SIInstrInfo::isMFMA(MI)) {
- const MCSchedClassDesc *SchedClassDesc =
- SchedModel.resolveSchedClass(&MI);
- NumMFMACycles =
- SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
- collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
- }
- }
- if (!InstrsToUnpack.empty()) {
- for (MachineInstr *MI : InstrsToUnpack) {
- if (!SIInstrInfo::modifiesModeRegister(*MI) &&
- !MI->modifiesRegister(AMDGPU::EXEC, TRI))
- performF32Unpacking(*MI);
- }
- }
-
MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
// Check first terminator for branches to optimize
if (TermI != MBB.end()) {
@@ -804,5 +799,29 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
}
}
+ for (MachineBasicBlock &MBB : MF) {
+ // Unpack packed instructions overlapped by MFMAs. This allows the compiler
+ // to co-issue unpacked instructions with MFMA
+ uint16_t NumMFMACycles = 0;
+ auto SchedModel = TII->getSchedModel();
+ SetVector<MachineInstr *> InstrsToUnpack;
+ for (auto &MI : make_early_inc_range(MBB.instrs())) {
+ if (SIInstrInfo::isMFMA(MI)) {
+ const MCSchedClassDesc *SchedClassDesc =
+ SchedModel.resolveSchedClass(&MI);
+ NumMFMACycles =
+ SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
+ }
+ }
+ if (!InstrsToUnpack.empty()) {
+ for (MachineInstr *MI : InstrsToUnpack) {
+ if (!SIInstrInfo::modifiesModeRegister(*MI) &&
+ !MI->modifiesRegister(AMDGPU::EXEC, TRI))
+ performF32Unpacking(*MI);
+ }
+ }
+ }
+
return Changed;
}
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index 0c1e063255cdf..6a9b406e390c0 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -79,7 +79,7 @@ body: |
; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
@@ -129,7 +129,7 @@ body: |
; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
@@ -260,7 +260,7 @@ body: |
; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
@@ -306,8 +306,9 @@ body: |
; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
@@ -322,6 +323,161 @@ body: |
early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
$vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
$vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_opcodes_not_supported_for_unpacking_are_skipped
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+
+ $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
+ $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
+
+ $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
+
+ S_ENDPGM 0
+
+...
+---
+name: test_opsel_register_is_correctly_marked_as_killed
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_opsel_register_is_correctly_marked_as_killed
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr5, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_inst_dependent_on_mfma_are_not_unpacked
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ ; GCN-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked
+ ; GCN: liveins: $sgpr4_sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GCN-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GCN-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GCN-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GCN-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
>From 56b505fdefbac0598f6735ba53b2505814061e11 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Tue, 16 Sep 2025 15:23:43 -0500
Subject: [PATCH 10/16] update condition for MFMA dependence check
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index d4db028dc1758..6d94476205b78 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -623,7 +623,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
// def and uses. Conservatively ensures that we do not incorrectly
// read/write registers.
for (const MachineOperand &InstrMO : Instr.operands()) {
- if (InstrMO.isReg() && !InstrMO.isDef()) {
+ if (InstrMO.isReg()) {
for (unsigned i = 0; i < RegsOverlappedByMFMADef.size(); ++i) {
if (TRI->regsOverlap(RegsOverlappedByMFMADef[i], InstrMO.getReg())) {
if (isUnpackingSupportedInstr(Instr))
>From 3ecc5da732faec35795361875bef14c0a440ea8f Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Wed, 17 Sep 2025 11:26:17 -0500
Subject: [PATCH 11/16] update code comments
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 21 +++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 6d94476205b78..62d100cef64ec 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -473,6 +473,13 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
const MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
Register DstReg = MI.getOperand(0).getReg();
+ // Only the first register in the register pair needs to be checked due to the
+ // unpacking order. Packed instructions are unpacked such that the lower 32
+ // bits (i.e., the first register in the pair) are written first. This can
+ // introduce dependencies if the first register is written in one instruction
+ // and then read as part of the higher 32 bits in the subsequent instruction.
+ // Such scenarios can arise due to specific combinations of op_sel and
+ // op_sel_hi modifiers.
Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
int Src0ModifiersIdx =
AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers);
@@ -486,6 +493,8 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
: TRI->getSubReg(SrcReg0, AMDGPU::sub0);
+ // Check if the register selected by op_sel_hi is the same as the first
+ // register in the destination register pair
if (UnpackedDstReg == HiSrc0Reg ||
TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
return true;
@@ -501,6 +510,7 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
return true;
}
+ // Applicable for packed instructions with 3 source operands, such as V_PK_FMA
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
int Src2ModifiersIdx =
AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers);
@@ -549,15 +559,14 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
// lane.
// NEG_HI shares the same bit position with ABS. But packed instructions do
// not support ABS. Therefore, NEG_HI must be translated to NEG source
- // modifier for the higher 32 bits. Unpacked VOP3 instructions do support
- // ABS, therefore we need to explicitly add the NEG modifier if present in
- // the packed instruction
+ // modifier for the higher 32 bits. Unpacked VOP3 instructions support
+ // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
+ // NEG modifier if present in the packed instruction
if (SrcMods & NegModifier)
NewSrcMods |= SISrcMods::NEG;
// Src modifiers. Only negative modifiers are added if needed. Unpacked
// operations do not have op_sel, therefore it must be handled explicitly as
- // done below. Unpacked operations support abs, but packed instructions do
- // not. Thus, abs is not handled.
+ // done below.
NewMI.addImm(NewSrcMods);
if (SrcMO.isImm()) {
NewMI.addImm(SrcMO.getImm());
@@ -799,6 +808,8 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
}
}
+ // TODO: fold this into previous block, if possible. Evaluate and handle any
+ // side effects.
for (MachineBasicBlock &MBB : MF) {
// Unpack packed instructions overlapped by MFMAs. This allows the compiler
// to co-issue unpacked instructions with MFMA
>From 48cf50cbea2b96851b8eb130a98b35674c4c0621 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Wed, 17 Sep 2025 21:18:25 -0500
Subject: [PATCH 12/16] update return conditions, reduce LOC
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 131 +++++++++----------
1 file changed, 60 insertions(+), 71 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 62d100cef64ec..b2f7780e1e2f9 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -48,7 +48,7 @@ class SIPreEmitPeephole {
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
// Check if the machine instruction being processed is a supported packed
- // instruction
+ // instruction.
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
@@ -60,23 +60,23 @@ class SIPreEmitPeephole {
// ==>
// v_fma_f32 v0, v1, v3, v3
// v_fma_f32 v1, v0, v2, v2
- // here, we have overwritten v0 before we use it. This function checks if
- // unpacking can lead to such a situation
+ // Here, we have overwritten v0 before we use it. This function checks if
+ // unpacking can lead to such a situation.
bool canUnpackingIntroduceDependencies(const MachineInstr &MI);
// Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
// V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
// this transformation.
void performF32Unpacking(MachineInstr &I);
- // Select corresponding unpacked instruction from packed instruction as I
+ // Select corresponding unpacked instruction
uint16_t mapToUnpackedOpcode(MachineInstr &I);
// Creates the unpacked instruction to be inserted. Adds source modifiers to
// the unpacked instructions based on the source modifiers in the packed
- // instruction
+ // instruction.
MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
bool IsHiBits);
- // process operands/source modifiers from packed instructions and insert the
- // appropriate source modifers and operands into the unpacked instructions
- void addOperandAndMods(MachineInstrBuilder NewMI, unsigned SrcMods,
+ // Process operands/source modifiers from packed instructions and insert the
+ // appropriate source modifers and operands into the unpacked instructions.
+ void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods,
bool IsHiBits, const MachineOperand &SrcMO);
public:
@@ -455,7 +455,7 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
}
// If support is extended to new operations, add tests in
-// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
@@ -481,48 +481,47 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
// Such scenarios can arise due to specific combinations of op_sel and
// op_sel_hi modifiers.
Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
- int Src0ModifiersIdx =
- AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers);
- int Src1ModifiersIdx =
- AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src1_modifiers);
- unsigned Src0Mods = MI.getOperand(Src0ModifiersIdx).getImm();
- unsigned Src1Mods = MI.getOperand(Src1ModifiersIdx).getImm();
-
- if (TII->getNamedOperand(MI, AMDGPU::OpName::src0)->isReg()) {
- Register SrcReg0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg();
+ unsigned Src0Mods =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
+ unsigned Src1Mods =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
+
+ const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ if (Src0MO->isReg()) {
+ Register SrcReg0 = Src0MO->getReg();
Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
: TRI->getSubReg(SrcReg0, AMDGPU::sub0);
// Check if the register selected by op_sel_hi is the same as the first
- // register in the destination register pair
- if (UnpackedDstReg == HiSrc0Reg ||
- TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
+ // register in the destination register pair.
+ if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
return true;
}
- if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isReg()) {
- Register SrcReg1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)->getReg();
+ const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (Src1MO->isReg()) {
+ Register SrcReg1 = Src1MO->getReg();
Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
: TRI->getSubReg(SrcReg1, AMDGPU::sub0);
- if (UnpackedDstReg == HiSrc1Reg ||
- TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
+ if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
return true;
}
- // Applicable for packed instructions with 3 source operands, such as V_PK_FMA
+ // Applicable for packed instructions with 3 source operands, such as
+ // V_PK_FMA.
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
- int Src2ModifiersIdx =
- AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers);
- unsigned Src2Mods = MI.getOperand(Src2ModifiersIdx).getImm();
- if (TII->getNamedOperand(MI, AMDGPU::OpName::src2)->isReg()) {
+ unsigned Src2Mods =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
+ const MachineOperand *Src2MO =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ if (Src2MO->isReg()) {
Register SrcReg2 =
TII->getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
: TRI->getSubReg(SrcReg2, AMDGPU::sub0);
- if (UnpackedDstReg == HiSrc2Reg ||
- TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
+ if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
return true;
}
}
@@ -533,7 +532,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
unsigned Opcode = I.getOpcode();
// Use 64 bit encoding to allow use of VOP3 instructions.
// VOP3 e64 instructions allow source modifiers
- // e32 instructions don't allow source modifiers
+ // e32 instructions don't allow source modifiers.
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
return AMDGPU::V_ADD_F32_e64;
@@ -547,7 +546,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
llvm_unreachable("Fully covered switch");
}
-void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
+void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
unsigned SrcMods, bool IsHiBits,
const MachineOperand &SrcMO) {
unsigned NewSrcMods = 0;
@@ -555,13 +554,13 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
// Packed instructions (VOP3P) do not support ABS. Hence, no checks are done
// for ABS modifiers.
- // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
- // lane.
- // NEG_HI shares the same bit position with ABS. But packed instructions do
- // not support ABS. Therefore, NEG_HI must be translated to NEG source
- // modifier for the higher 32 bits. Unpacked VOP3 instructions support
- // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
- // NEG modifier if present in the packed instruction
+ // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
+ // lane.
+ // NEG_HI shares the same bit position with ABS. But packed instructions do
+ // not support ABS. Therefore, NEG_HI must be translated to NEG source
+ // modifier for the higher 32 bits. Unpacked VOP3 instructions support
+ // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
+ // NEG modifier if present in the packed instruction.
if (SrcMods & NegModifier)
NewSrcMods |= SISrcMods::NEG;
// Src modifiers. Only negative modifiers are added if needed. Unpacked
@@ -572,7 +571,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI,
NewMI.addImm(SrcMO.getImm());
return;
}
- // If op_sel == 0, select register 0 of reg:sub0_sub1
+ // If op_sel == 0, select register 0 of reg:sub0_sub1.
Register UnpackedSrcReg = (SrcMods & OpSelModifier)
? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
: TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
@@ -609,8 +608,8 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
auto E = BB->end();
int TotalCyclesBetweenCandidates = 0;
auto SchedModel = TII->getSchedModel();
- SetVector<Register> RegsOverlappedByMFMADef;
- RegsOverlappedByMFMADef.insert(BeginMI.getOperand(0).getReg());
+ Register MFMADef = BeginMI.getOperand(0).getReg();
+
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
if (Instr.isMetaInstruction())
@@ -619,6 +618,9 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
return;
if (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr))
return;
+ if (SIInstrInfo::modifiesModeRegister(Instr) &&
+ Instr.modifiesRegister(AMDGPU::EXEC, TRI))
+ return;
const MCSchedClassDesc *InstrSchedClassDesc =
SchedModel.resolveSchedClass(&Instr);
TotalCyclesBetweenCandidates +=
@@ -633,14 +635,8 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
// read/write registers.
for (const MachineOperand &InstrMO : Instr.operands()) {
if (InstrMO.isReg()) {
- for (unsigned i = 0; i < RegsOverlappedByMFMADef.size(); ++i) {
- if (TRI->regsOverlap(RegsOverlappedByMFMADef[i], InstrMO.getReg())) {
- if (isUnpackingSupportedInstr(Instr))
- return;
- RegsOverlappedByMFMADef.insert(Instr.getOperand(0).getReg());
- break;
- }
- }
+ if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
+ return;
}
}
if (isUnpackingSupportedInstr(Instr)) {
@@ -649,14 +645,15 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
return;
// If it is a packed instruction, we should subtract it's latency from the
// overall latency calculation here, because the packed instruction will
- // be removed and replaced by 2 unpacked instructions
+ // be removed and replaced by 2 unpacked instructions.
TotalCyclesBetweenCandidates -=
SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
// We're adding 2 to account for the extra latency added by unpacking into
// 2 instructions. At the time of writing, the considered unpacked
// instructions have latency of 1.
- // TODO: improve latency handling of possible inserted instructions
+ // TODO: improve latency handling of possible inserted instructions.
TotalCyclesBetweenCandidates += 2;
+ // Subtract 1 to account for MFMA issue latency.
if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1))
InstrsToUnpack.insert(&Instr);
}
@@ -711,15 +708,11 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
: TRI->getSubReg(DstReg, AMDGPU::sub0);
- int ClampIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::clamp);
- int64_t ClampVal = I.getOperand(ClampIdx).getImm();
- int Src0ModifiersIdx =
- AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers);
- int Src1ModifiersIdx =
- AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src1_modifiers);
-
- unsigned Src0Mods = I.getOperand(Src0ModifiersIdx).getImm();
- unsigned Src1Mods = I.getOperand(Src1ModifiersIdx).getImm();
+ int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
+ unsigned Src0Mods =
+ TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
+ unsigned Src1Mods =
+ TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
@@ -729,9 +722,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
const MachineOperand *SrcMO3 =
TII->getNamedOperand(I, AMDGPU::OpName::src2);
- int Src2ModifiersIdx =
- AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers);
- unsigned Src2Mods = I.getOperand(Src2ModifiersIdx).getImm();
+ unsigned Src2Mods =
+ TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
}
NewMI.addImm(ClampVal); // clamp
@@ -785,7 +777,6 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
// and limit the distance to 20 instructions for compile time purposes.
// Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
// may be bundled with the instructions they modify.
- //
for (auto &MI : make_early_inc_range(MBB.instrs())) {
if (Count == Threshold)
SetGPRMI = nullptr;
@@ -808,7 +799,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
}
}
- // TODO: fold this into previous block, if possible. Evaluate and handle any
+ // TODO: Fold this into previous block, if possible. Evaluate and handle any
// side effects.
for (MachineBasicBlock &MBB : MF) {
// Unpack packed instructions overlapped by MFMAs. This allows the compiler
@@ -827,9 +818,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
}
if (!InstrsToUnpack.empty()) {
for (MachineInstr *MI : InstrsToUnpack) {
- if (!SIInstrInfo::modifiesModeRegister(*MI) &&
- !MI->modifiesRegister(AMDGPU::EXEC, TRI))
- performF32Unpacking(*MI);
+ performF32Unpacking(*MI);
}
}
}
>From 022fa2d09a5aba6b96e33b191ea1e4d38dce2bbd Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 18 Sep 2025 10:20:48 -0500
Subject: [PATCH 13/16] new tests, reduced nesting, cleanup
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 108 ++++++++----------
...ck-non-coissue-insts-post-ra-scheduler.mir | 86 ++++++++++++++
2 files changed, 136 insertions(+), 58 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index b2f7780e1e2f9..71f8721efec04 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -62,7 +62,7 @@ class SIPreEmitPeephole {
// v_fma_f32 v1, v0, v2, v2
// Here, we have overwritten v0 before we use it. This function checks if
// unpacking can lead to such a situation.
- bool canUnpackingIntroduceDependencies(const MachineInstr &MI);
+ bool canUnpackingClobberRegister(const MachineInstr &MI);
// Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
// V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
// this transformation.
@@ -469,7 +469,7 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
llvm_unreachable("Fully covered switch");
}
-bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
+bool SIPreEmitPeephole::canUnpackingClobberRegister(
const MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
Register DstReg = MI.getOperand(0).getReg();
@@ -481,14 +481,12 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
// Such scenarios can arise due to specific combinations of op_sel and
// op_sel_hi modifiers.
Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
- unsigned Src0Mods =
- TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
- unsigned Src1Mods =
- TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
- if (Src0MO->isReg()) {
+ if (Src0MO && Src0MO->isReg()) {
Register SrcReg0 = Src0MO->getReg();
+ unsigned Src0Mods =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
: TRI->getSubReg(SrcReg0, AMDGPU::sub0);
@@ -499,8 +497,10 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
}
const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- if (Src1MO->isReg()) {
+ if (Src1MO && Src1MO->isReg()) {
Register SrcReg1 = Src1MO->getReg();
+ unsigned Src1Mods =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
: TRI->getSubReg(SrcReg1, AMDGPU::sub0);
@@ -511,13 +511,13 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
// Applicable for packed instructions with 3 source operands, such as
// V_PK_FMA.
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
- unsigned Src2Mods =
- TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
const MachineOperand *Src2MO =
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
- if (Src2MO->isReg()) {
+ if (Src2MO && Src2MO->isReg()) {
Register SrcReg2 =
TII->getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
+ unsigned Src2Mods =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
: TRI->getSubReg(SrcReg2, AMDGPU::sub0);
@@ -614,19 +614,19 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
MachineInstr &Instr = *I;
if (Instr.isMetaInstruction())
continue;
- if (Instr.isTerminator())
- return;
- if (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr))
- return;
- if (SIInstrInfo::modifiesModeRegister(Instr) &&
- Instr.modifiesRegister(AMDGPU::EXEC, TRI))
+ if ((Instr.isTerminator()) ||
+ (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+ (SIInstrInfo::modifiesModeRegister(Instr) &&
+ Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
return;
+
const MCSchedClassDesc *InstrSchedClassDesc =
SchedModel.resolveSchedClass(&Instr);
- TotalCyclesBetweenCandidates +=
+ uint16_t Latency =
SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
+ TotalCyclesBetweenCandidates += Latency;
- if (TotalCyclesBetweenCandidates > NumMFMACycles)
+ if (TotalCyclesBetweenCandidates > NumMFMACycles - 1)
return;
// Identify register dependencies between those used by the MFMA
// instruction and the following packed instructions. Also checks for
@@ -634,29 +634,26 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
// def and uses. Conservatively ensures that we do not incorrectly
// read/write registers.
for (const MachineOperand &InstrMO : Instr.operands()) {
- if (InstrMO.isReg()) {
- if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
- return;
- }
- }
- if (isUnpackingSupportedInstr(Instr)) {
- assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued.");
- if (canUnpackingIntroduceDependencies(Instr))
+ if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
+ continue;
+ if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
return;
- // If it is a packed instruction, we should subtract it's latency from the
- // overall latency calculation here, because the packed instruction will
- // be removed and replaced by 2 unpacked instructions.
- TotalCyclesBetweenCandidates -=
- SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
- // We're adding 2 to account for the extra latency added by unpacking into
- // 2 instructions. At the time of writing, the considered unpacked
- // instructions have latency of 1.
- // TODO: improve latency handling of possible inserted instructions.
- TotalCyclesBetweenCandidates += 2;
- // Subtract 1 to account for MFMA issue latency.
- if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1))
- InstrsToUnpack.insert(&Instr);
}
+ if (!isUnpackingSupportedInstr(Instr))
+ continue;
+
+ assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued.");
+ if (canUnpackingClobberRegister(Instr))
+ return;
+ // If it's a packed instruction, adjust latency: remove the packed
+ // latency, add latency of two unpacked instructions (currently estimated
+ // as 2 cycles).
+ TotalCyclesBetweenCandidates -= Latency;
+ // TODO: improve latency handling based on instruction modeling.
+ TotalCyclesBetweenCandidates += 2;
+ // Subtract 1 to account for MFMA issue latency.
+ if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
+ InstrsToUnpack.insert(&Instr);
}
return;
}
@@ -672,8 +669,7 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
MachineOperand LoDstOp = Op0LOp1L->getOperand(0);
- if (DstOp.isUndef())
- LoDstOp.setIsUndef();
+ LoDstOp.setIsUndef(DstOp.isUndef());
MachineInstrBuilder Op0HOp1H =
createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
@@ -687,10 +683,9 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract);
Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract);
}
- if (DstOp.getReg().isPhysical() && DstOp.isRenamable()) {
- LoDstOp.setIsRenamable(true);
- HiDstOp.setIsRenamable(true);
- }
+
+ LoDstOp.setIsRenamable(DstOp.isRenamable());
+ HiDstOp.setIsRenamable(DstOp.isRenamable());
I.eraseFromParent();
return;
@@ -804,22 +799,19 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
// Unpack packed instructions overlapped by MFMAs. This allows the compiler
// to co-issue unpacked instructions with MFMA
- uint16_t NumMFMACycles = 0;
auto SchedModel = TII->getSchedModel();
SetVector<MachineInstr *> InstrsToUnpack;
for (auto &MI : make_early_inc_range(MBB.instrs())) {
- if (SIInstrInfo::isMFMA(MI)) {
- const MCSchedClassDesc *SchedClassDesc =
- SchedModel.resolveSchedClass(&MI);
- NumMFMACycles =
- SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
- collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
- }
+ if (!SIInstrInfo::isMFMA(MI))
+ continue;
+ const MCSchedClassDesc *SchedClassDesc =
+ SchedModel.resolveSchedClass(&MI);
+ uint16_t NumMFMACycles =
+ SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+ collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
}
- if (!InstrsToUnpack.empty()) {
- for (MachineInstr *MI : InstrsToUnpack) {
- performF32Unpacking(*MI);
- }
+ for (MachineInstr *MI : InstrsToUnpack) {
+ performF32Unpacking(*MI);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index 6a9b406e390c0..054ca918fdb0f 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -481,3 +481,89 @@ body: |
$vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
+
+...
+---
+name: test_mfma_def_using_instr_blocks_unpacking
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_unpacking_with_imm_input
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ S_WAITCNT 49279
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ S_WAITCNT 49279
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+
+...
+---
+name: test_neg_lo_hi_post_unpacking
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$sgpr4_sgpr5' }
+
+body: |
+ bb.0.entry:
+ liveins: $sgpr4_sgpr5
+ early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ S_WAITCNT 49279
+ renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ S_WAITCNT 49279
+ $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
\ No newline at end of file
>From 720468f8cd1dfd6a1ee6b782a0f5d3a4ad6f7b29 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 18 Sep 2025 12:29:48 -0500
Subject: [PATCH 14/16] update tests, update unpacking selection condition
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 3 +-
...ck-non-coissue-insts-post-ra-scheduler.mir | 1014 +++++++++++++----
2 files changed, 811 insertions(+), 206 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 71f8721efec04..fb89248cb5919 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -458,6 +458,8 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
+ if (!TII->isNeverCoissue(MI))
+ return false;
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
case AMDGPU::V_PK_MUL_F32:
@@ -642,7 +644,6 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (!isUnpackingSupportedInstr(Instr))
continue;
- assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued.");
if (canUnpackingClobberRegister(Instr))
return;
// If it's a packed instruction, adjust latency: remove the packed
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index 054ca918fdb0f..8b467eb0b054e 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -1,5 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX950 %s
+# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX942 %s
+# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX90a %s
---
name: test_pk_mul_unpacking_f32
@@ -11,27 +13,70 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
- ; GCN-LABEL: name: test_pk_mul_unpacking_f32
- ; GCN: liveins: $sgpr4_sgpr5
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: S_WAITCNT 49279
- ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: S_WAITCNT 49279
- ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
+ ; GFX950-LABEL: name: test_pk_mul_unpacking_f32
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: S_WAITCNT 49279
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: S_WAITCNT 49279
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_pk_mul_unpacking_f32
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT 49279
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: S_WAITCNT 49279
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_pk_mul_unpacking_f32
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: S_WAITCNT 49279
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: S_WAITCNT 49279
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT 49279
@@ -61,27 +106,70 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
- ; GCN-LABEL: name: test_op_sel_selection_unpacking_f32
- ; GCN: liveins: $sgpr4_sgpr5
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: S_WAITCNT 49279
- ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: S_WAITCNT 49279
- ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
+ ; GFX950-LABEL: name: test_op_sel_selection_unpacking_f32
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: S_WAITCNT 49279
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: S_WAITCNT 49279
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_op_sel_selection_unpacking_f32
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT 49279
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: S_WAITCNT 49279
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_op_sel_selection_unpacking_f32
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: S_WAITCNT 49279
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: S_WAITCNT 49279
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT 49279
@@ -111,27 +199,70 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
- ; GCN-LABEL: name: test_op_sel_hi_selection_unpacking_f32
- ; GCN: liveins: $sgpr4_sgpr5
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: S_WAITCNT 49279
- ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: S_WAITCNT 49279
- ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
+ ; GFX950-LABEL: name: test_op_sel_hi_selection_unpacking_f32
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: S_WAITCNT 49279
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: S_WAITCNT 49279
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_op_sel_hi_selection_unpacking_f32
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT 49279
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: S_WAITCNT 49279
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_op_sel_hi_selection_unpacking_f32
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: S_WAITCNT 49279
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: S_WAITCNT 49279
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT 49279
@@ -161,44 +292,119 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
- ; GCN-LABEL: name: test_pk_add_unpacking_f32
- ; GCN: liveins: $sgpr4_sgpr5
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GCN-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
- ; GCN-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
- ; GCN-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec
- ; GCN-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec
- ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
- ; GCN-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
- ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
- ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr2 = nofpexcept V_ADD_F32_e64 0, killed $sgpr2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr3 = nofpexcept V_ADD_F32_e64 0, killed $sgpr3, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr6 = nofpexcept V_ADD_F32_e64 0, killed $sgpr6, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr7 = nofpexcept V_ADD_F32_e64 0, killed $sgpr7, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr4 = nofpexcept V_ADD_F32_e64 0, killed $sgpr4, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr5 = nofpexcept V_ADD_F32_e64 0, killed $sgpr5, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
+ ; GFX950-LABEL: name: test_pk_add_unpacking_f32
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX950-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
+ ; GFX950-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr2 = nofpexcept V_ADD_F32_e64 0, killed $sgpr2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr3 = nofpexcept V_ADD_F32_e64 0, killed $sgpr3, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr6 = nofpexcept V_ADD_F32_e64 0, killed $sgpr6, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr7 = nofpexcept V_ADD_F32_e64 0, killed $sgpr7, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = nofpexcept V_ADD_F32_e64 0, killed $sgpr4, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = nofpexcept V_ADD_F32_e64 0, killed $sgpr5, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_pk_add_unpacking_f32
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX942-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
+ ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr2 = nofpexcept V_ADD_F32_e64 0, killed $sgpr2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr3 = nofpexcept V_ADD_F32_e64 0, killed $sgpr3, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr6 = nofpexcept V_ADD_F32_e64 0, killed $sgpr6, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr7 = nofpexcept V_ADD_F32_e64 0, killed $sgpr7, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = nofpexcept V_ADD_F32_e64 0, killed $sgpr4, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = nofpexcept V_ADD_F32_e64 0, killed $sgpr5, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_pk_add_unpacking_f32
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+ ; GFX90a-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec
+ ; GFX90a-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec
renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec
@@ -244,25 +450,64 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
- ; GCN-LABEL: name: test_pk_fma_unpacking_f32
- ; GCN: liveins: $sgpr4_sgpr5
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
+ ; GFX950-LABEL: name: test_pk_fma_unpacking_f32
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_pk_fma_unpacking_f32
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_pk_fma_unpacking_f32
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -290,26 +535,67 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
- ; GCN-LABEL: name: test_unpacking_does_not_introduce_rw_dependency
- ; GCN: liveins: $sgpr4_sgpr5
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
+ ; GFX950-LABEL: name: test_unpacking_does_not_introduce_rw_dependency
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_unpacking_does_not_introduce_rw_dependency
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_unpacking_does_not_introduce_rw_dependency
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -338,28 +624,74 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
- ; GCN-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
- ; GCN: liveins: $sgpr4_sgpr5
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
- ; GCN-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GCN-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GCN-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
+ ; GFX950-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -394,27 +726,70 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
- ; GCN-LABEL: name: test_opsel_register_is_correctly_marked_as_killed
- ; GCN: liveins: $sgpr4_sgpr5
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
- ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr5, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
+ ; GFX950-LABEL: name: test_opsel_register_is_correctly_marked_as_killed
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr5, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_opsel_register_is_correctly_marked_as_killed
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr5, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_opsel_register_is_correctly_marked_as_killed
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -444,26 +819,68 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
- ; GCN-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked
- ; GCN: liveins: $sgpr4_sgpr5
- ; GCN-NEXT: {{ $}}
- ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
- ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
- ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
- ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
- ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
- ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
- ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
- ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
- ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
- ; GCN-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
- ; GCN-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
- ; GCN-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
- ; GCN-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: S_ENDPGM 0
+ ; GFX950-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GFX950-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GFX950-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec
+ ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec
+ ; GFX90a-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -493,6 +910,65 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
+ ; GFX950-LABEL: name: test_mfma_def_using_instr_blocks_unpacking
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_mfma_def_using_instr_blocks_unpacking
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_mfma_def_using_instr_blocks_unpacking
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
@@ -521,6 +997,70 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
+ ; GFX950-LABEL: name: test_unpacking_with_imm_input
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: S_WAITCNT 49279
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: S_WAITCNT 49279
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_unpacking_with_imm_input
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT 49279
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: S_WAITCNT 49279
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_unpacking_with_imm_input
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: S_WAITCNT 49279
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: S_WAITCNT 49279
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT 49279
@@ -550,6 +1090,70 @@ liveins:
body: |
bb.0.entry:
liveins: $sgpr4_sgpr5
+ ; GFX950-LABEL: name: test_neg_lo_hi_post_unpacking
+ ; GFX950: liveins: $sgpr4_sgpr5
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX950-NEXT: S_WAITCNT 49279
+ ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX950-NEXT: S_WAITCNT 49279
+ ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 1, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 1, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_neg_lo_hi_post_unpacking
+ ; GFX942: liveins: $sgpr4_sgpr5
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX942-NEXT: S_WAITCNT 49279
+ ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX942-NEXT: S_WAITCNT 49279
+ ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 1, killed $vgpr4, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 1, killed $vgpr5, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90a-LABEL: name: test_neg_lo_hi_post_unpacking
+ ; GFX90a: liveins: $sgpr4_sgpr5
+ ; GFX90a-NEXT: {{ $}}
+ ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+ ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX90a-NEXT: S_WAITCNT 49279
+ ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+ ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+ ; GFX90a-NEXT: S_WAITCNT 49279
+ ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+ ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec
+ ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51
+ ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
+ ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
+ ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90a-NEXT: S_ENDPGM 0
early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
S_WAITCNT 49279
@@ -566,4 +1170,4 @@ body: |
$vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec
$vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- S_ENDPGM 0
\ No newline at end of file
+ S_ENDPGM 0
>From 4685911d43555f9629091ed294e9c0dce78f6f60 Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 18 Sep 2025 15:09:52 -0500
Subject: [PATCH 15/16] minor fixes
---
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index fb89248cb5919..e71f5ed1cdaa0 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -457,9 +457,9 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
// If support is extended to new operations, add tests in
// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
- unsigned Opcode = MI.getOpcode();
if (!TII->isNeverCoissue(MI))
return false;
+ unsigned Opcode = MI.getOpcode();
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
case AMDGPU::V_PK_MUL_F32:
@@ -516,8 +516,7 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(
const MachineOperand *Src2MO =
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (Src2MO && Src2MO->isReg()) {
- Register SrcReg2 =
- TII->getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
+ Register SrcReg2 = Src2MO->getReg();
unsigned Src2Mods =
TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
@@ -628,7 +627,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
TotalCyclesBetweenCandidates += Latency;
- if (TotalCyclesBetweenCandidates > NumMFMACycles - 1)
+ if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
return;
// Identify register dependencies between those used by the MFMA
// instruction and the following packed instructions. Also checks for
@@ -663,8 +662,7 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
MachineOperand DstOp = I.getOperand(0);
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
- if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
- return;
+ assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() && "Unsupported Opcode");
MachineInstrBuilder Op0LOp1L =
createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
>From 3e016ca3ee60cb6fffccffc519be885412d65d5a Mon Sep 17 00:00:00 2001
From: Akash Dutta <Akash.Dutta at amd.com>
Date: Thu, 18 Sep 2025 16:12:38 -0500
Subject: [PATCH 16/16] update failing test
---
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 24 ++++++++++++-------
1 file changed, 16 insertions(+), 8 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index aad6e031aa9ed..b07dec326327e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -536,9 +536,12 @@
; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134
; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
- ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48
+ ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48
+ ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48
+ ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48
+ ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48
+ ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48
; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
@@ -547,9 +550,12 @@
; GCN-NEXT: v_exp_f32_e32 v58, v58
; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
- ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48
+ ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48
+ ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48
+ ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48
+ ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48
+ ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48
; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
@@ -561,8 +567,10 @@
; GCN-NEXT: v_exp_f32_e32 v59, v57
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134
- ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48
+ ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48
+ ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48
+ ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48
; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
More information about the llvm-commits
mailing list