[llvm] [AMDGPU] Optionally use the downcasted version for SchedGroups (PR #164024)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 17 15:32:46 PDT 2025
https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/164024
Adds an option to use the unpacked sequence of instructions when making SchedGroup assignment decisions.
To facilitate this, the patch also adds `TII->getDowncastSequence`, which forces the unpacker and the scheduling code to agree on how the unpacking will be done.
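For intuition, a minimal standalone sketch of the accounting this enables follows (plain C++, not the in-tree code; the opcode enum and the helpers getDowncastSequence/canClassifyAsVALU are simplified stand-ins for TII->getDowncastSequence and SchedGroup::canAddSingleMI). With the option enabled, a downcast candidate contributes one SchedGroup slot per unpacked opcode the group mask can classify, rather than a flat count of one.

#include <cstdio>
#include <optional>
#include <vector>

enum Opcode { V_PK_ADD_F32, V_ADD_F32_e64, S_ADD_U32 };

// Stand-in for TII->getDowncastSequence: return the unpacked opcodes, or
// nothing if the instruction is not a downcast candidate.
static std::optional<std::vector<Opcode>> getDowncastSequence(Opcode Op) {
  if (Op == V_PK_ADD_F32)
    return std::vector<Opcode>{V_ADD_F32_e64, V_ADD_F32_e64};
  return std::nullopt;
}

// Stand-in for SchedGroup::canAddSingleMI with a VALU-only mask.
static bool canClassifyAsVALU(Opcode Op) { return Op != S_ADD_U32; }

// Mirrors the spirit of SchedGroup::add: how much one instruction grows
// CurrentSize toward the group's MaxSize.
static unsigned sizeContribution(Opcode Op, bool UseDowncastOps) {
  std::optional<std::vector<Opcode>> Seq;
  if (UseDowncastOps)
    Seq = getDowncastSequence(Op);
  if (!Seq)
    return 1;
  unsigned N = 0;
  for (Opcode Unpacked : *Seq)
    if (canClassifyAsVALU(Unpacked))
      ++N;
  return N;
}

int main() {
  // A V_PK_ADD_F32 fills one slot by default, but two once the scheduler
  // assumes it will later be unpacked into two V_ADD_F32_e64.
  std::printf("default:  %u\n", sizeContribution(V_PK_ADD_F32, false)); // 1
  std::printf("downcast: %u\n", sizeContribution(V_PK_ADD_F32, true));  // 2
  return 0;
}

In the patch itself this counting lives in SchedGroup::add/pop and goes through canAddSingleMI with the full SchedGroupMask, so the real classification also accounts for mayLoad/mayStore and the non-VALU group kinds.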
From 19decad1b4fc681772386e0d25e31160e9e689f8 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 16 Oct 2025 14:49:26 -0700
Subject: [PATCH] [AMDGPU] Optionally use the downcasted version for
SchedGroups
Change-Id: Iffc6b6309ba050f139298d88c1dbdb9ab0fe1fd3
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 156 ++++++++---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 35 +++
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 +
llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 53 +++-
.../CodeGen/AMDGPU/sched.group.downcast.mir | 244 ++++++++++++++++++
5 files changed, 451 insertions(+), 41 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 5700468e2420e..a1a9b2b7162ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -60,6 +60,17 @@ static cl::opt<bool> UseCostHeur(
"Experimentally, results are mixed, so this should be set on a "
"case-by-case basis."));
+static cl::opt<bool> UseDowncastOps(
+ "amdgpu-igrouplp-use-downcast-ops", cl::Hidden,
+ cl::desc("Whether to use the downcast alternative OpCodes instead of the "
+ "current OpCode. Under certain conditions, some OpCodes may be "
+ "downcast "
+ "to an alternative sequence after scheduling (e.g. V_PK_MUL_F32 "
+ "-> V_MUL_F32). "
+ "This flag enables SchedGroup classification based on the "
+ "alternative."),
+ cl::init(false));
+
// Components of the mask that determines which instruction types may be may be
// classified into a SchedGroup.
enum class SchedGroupMask {
@@ -133,6 +144,8 @@ class SchedGroup {
// SGID is used to map instructions to candidate SchedGroups
unsigned SGID;
+ unsigned CurrentSize = 0;
+
// The different rules each instruction in this SchedGroup must conform to
SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;
@@ -143,9 +156,14 @@ class SchedGroup {
bool tryAddEdge(SUnit *A, SUnit *B);
// Use SGMask to determine whether we can classify MI as a member of this
- // SchedGroup object.
+ // SchedGroup object. If UseDowncastOps is specified and this is a candidate
+ // for downcasting, use the downcasted OpCodes.
bool canAddMI(const MachineInstr &MI) const;
+ // Use SGMask to determine whether we can classify an opcode as a member of
+ // this SchedGroup object.
+ bool canAddSingleMI(unsigned Opcode, bool MayLoad, bool MayStore) const;
+
public:
// Collection of SUnits that are classified as members of this group.
SmallVector<SUnit *, 32> Collection;
@@ -176,7 +194,7 @@ class SchedGroup {
void link(SchedGroup &OtherGroup);
// Returns true if no more instructions may be added to this group.
- bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
+ bool isFull() const { return MaxSize && CurrentSize >= *MaxSize; }
// Append a constraint that SUs must meet in order to fit into this
// SchedGroup. Since many rules involve the relationship between a SchedGroup
@@ -202,10 +220,55 @@ class SchedGroup {
<< format_hex((int)SGMask, 10, true) << " adding "
<< *SU.getInstr());
Collection.push_back(&SU);
+ MachineInstr &MI = *SU.getInstr();
+ if (!UseDowncastOps || MI.isMetaInstruction()) {
+ ++CurrentSize;
+ return;
+ }
+
+ SmallVector<unsigned, 4> UnpackSequence;
+ if (!TII->getDowncastSequence(MI, UnpackSequence,
+ DAG->MF.getSubtarget<GCNSubtarget>())) {
+ ++CurrentSize;
+ return;
+ }
+
+ for (unsigned UnpackOp : UnpackSequence) {
+ if (canAddSingleMI(UnpackOp, MI.mayLoad(), MI.mayStore()))
+ ++CurrentSize;
+ }
}
// Remove last element in the SchedGroup
- void pop() { Collection.pop_back(); }
+ void pop() {
+ SUnit *SU = Collection.pop_back_val();
+ MachineInstr &MI = *SU->getInstr();
+ if (!UseDowncastOps || MI.isMetaInstruction()) {
+ assert(CurrentSize >= 1);
+ --CurrentSize;
+ return;
+ }
+
+ SmallVector<unsigned, 4> UnpackSequence;
+ if (!TII->getDowncastSequence(MI, UnpackSequence,
+ DAG->MF.getSubtarget<GCNSubtarget>())) {
+ assert(CurrentSize >= 1);
+ --CurrentSize;
+ return;
+ }
+
+ for (unsigned UnpackOp : UnpackSequence) {
+ if (canAddSingleMI(UnpackOp, MI.mayLoad(), MI.mayStore())) {
+ assert(CurrentSize >= 1);
+ --CurrentSize;
+ }
+ }
+ }
+
+ void clear() {
+ Collection.clear();
+ CurrentSize = 0;
+ }
// Identify and add all relevant SUs from the DAG to this SchedGroup.
void initSchedGroup();
@@ -371,16 +434,16 @@ class PipelineSolver {
};
void PipelineSolver::reset() {
-
for (auto &SyncPipeline : CurrPipeline) {
for (auto &SG : SyncPipeline) {
SmallVector<SUnit *, 32> TempCollection = SG.Collection;
- SG.Collection.clear();
+ SG.clear();
auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
});
- if (SchedBarr != TempCollection.end())
- SG.Collection.push_back(*SchedBarr);
+ if (SchedBarr != TempCollection.end()) {
+ SG.add(**SchedBarr);
+ }
}
}
@@ -2386,64 +2449,99 @@ bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
return false;
}
-bool SchedGroup::canAddMI(const MachineInstr &MI) const {
+bool SchedGroup::canAddSingleMI(unsigned Opcode, bool MayLoad,
+ bool MayStore) const {
bool Result = false;
- if (MI.isMetaInstruction())
- Result = false;
- else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
- (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
- TII->isTRANS(MI)))
- Result = !MI.mayLoadOrStore();
+ if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
+ (TII->isVALU(Opcode) || TII->isMFMAorWMMA(Opcode) ||
+ TII->isSALU(Opcode) || TII->isTRANS(Opcode)))
+ Result = !(MayLoad || MayStore);
else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
- TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) {
+ TII->isVALU(Opcode) && !TII->isMFMAorWMMA(Opcode) &&
+ !TII->isTRANS(Opcode)) {
// Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
// For our purposes, these shall not be classified as VALU as this results
// in unexpected behavior.
- Result = !MI.mayLoadOrStore();
+ Result = !(MayLoad || MayStore);
}
else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
- TII->isSALU(MI))
- Result = !MI.mayLoadOrStore();
+ TII->isSALU(Opcode))
+ Result = !(MayLoad || MayStore);
else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
- TII->isMFMAorWMMA(MI))
+ TII->isMFMAorWMMA(Opcode))
Result = true;
else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
- TII->isVMEM(MI))
+ (TII->isVMEM(Opcode) || TII->isFLAT(Opcode)))
Result = true;
else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
- MI.mayLoad() && TII->isVMEM(MI))
+ MayLoad && (TII->isVMEM(Opcode) || TII->isFLAT(Opcode)))
Result = true;
else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
- MI.mayStore() && TII->isVMEM(MI))
+ MayStore && (TII->isVMEM(Opcode) || TII->isFLAT(Opcode)))
Result = true;
else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
- TII->isDS(MI))
+ TII->isDS(Opcode))
Result = true;
else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
- MI.mayLoad() && TII->isDS(MI))
+ MayLoad && TII->isDS(Opcode))
Result = true;
else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
- MI.mayStore() && TII->isDS(MI))
+ MayStore && TII->isDS(Opcode))
Result = true;
else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
- TII->isTRANS(MI))
+ TII->isTRANS(Opcode))
Result = true;
- LLVM_DEBUG(
- dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
- << (Result ? " could classify " : " unable to classify ") << MI);
+ return Result;
+}
+
+bool SchedGroup::canAddMI(const MachineInstr &MI) const {
+ bool Result = false;
+
+ auto emitDebug = [this](const MachineInstr &MI, bool Result) {
+ LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
+ << format_hex((int)SGMask, 10, true)
+ << (Result ? " could classify " : " unable to classify ")
+ << MI);
+ };
+
+ if (MI.isMetaInstruction()) {
+ emitDebug(MI, false);
+ return false;
+ }
+
+ if (!UseDowncastOps) {
+ Result = canAddSingleMI(MI.getOpcode(), MI.mayLoad(), MI.mayStore());
+ emitDebug(MI, Result);
+ return Result;
+ }
+
+ SmallVector<unsigned, 4> UnpackSequence;
+ if (!TII->getDowncastSequence(MI, UnpackSequence,
+ DAG->MF.getSubtarget<GCNSubtarget>())) {
+ Result = canAddSingleMI(MI.getOpcode(), MI.mayLoad(), MI.mayStore());
+ emitDebug(MI, Result);
+ return Result;
+ }
+
+  // We have an unpackable MI; check whether the unpack OpCodes are
+  // classifiable by this mask.
+ for (unsigned UnpackOp : UnpackSequence) {
+ Result |= canAddSingleMI(UnpackOp, MI.mayLoad(), MI.mayStore());
+ }
+ emitDebug(MI, Result);
return Result;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 50447f48a628c..17f5789afdd4c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6366,6 +6366,41 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return isImmOperandLegal(MI, OpIdx, *MO);
}
+bool SIInstrInfo::getDowncastSequence(const MachineInstr &MI,
+ SmallVectorImpl<unsigned> &Sequence,
+ const GCNSubtarget &ST) const {
+  bool IsGFX940Plus = ST.hasGFX940Insts();
+ switch (MI.getOpcode()) {
+  // Use the 64-bit encoding to allow use of VOP3 instructions:
+  // VOP3 e64 instructions allow source modifiers,
+  // while e32 instructions do not.
+ case AMDGPU::V_PK_ADD_F32: {
+    if (!IsGFX940Plus)
+ return false;
+ Sequence.push_back(AMDGPU::V_ADD_F32_e64);
+ Sequence.push_back(AMDGPU::V_ADD_F32_e64);
+ return true;
+ }
+ case AMDGPU::V_PK_MUL_F32: {
+    if (!IsGFX940Plus)
+ return false;
+ Sequence.push_back(AMDGPU::V_MUL_F32_e64);
+ Sequence.push_back(AMDGPU::V_MUL_F32_e64);
+ return true;
+ }
+ case AMDGPU::V_PK_FMA_F32: {
+    if (!IsGFX940Plus)
+ return false;
+ Sequence.push_back(AMDGPU::V_FMA_F32_e64);
+ Sequence.push_back(AMDGPU::V_FMA_F32_e64);
+ return true;
+ }
+ default:
+ return false;
+ }
+ llvm_unreachable("Fully covered switch");
+}
+
bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
bool IsGFX950Only = ST.hasGFX950Insts();
bool IsGFX940Only = ST.hasGFX940Insts();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index df27ec1f8de8c..e51f3b996e250 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1237,6 +1237,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
bool isNeverCoissue(MachineInstr &MI) const;
+ bool getDowncastSequence(const MachineInstr &MI,
+ SmallVectorImpl<unsigned> &Sequence,
+ const GCNSubtarget &ST) const;
+
/// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO.
bool isLegalAV64PseudoImm(uint64_t Imm) const;
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 7431e111ec862..b06c3f0a89399 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -51,7 +51,8 @@ class SIPreEmitPeephole {
// for unpacking.
void collectUnpackingCandidates(MachineInstr &BeginMI,
SetVector<MachineInstr *> &InstrsToUnpack,
- uint16_t NumMFMACycles);
+ uint16_t NumMFMACycles,
+ const GCNSubtarget &ST);
// v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
// op_sel_hi:[0,0,0]
// ==>
@@ -63,7 +64,7 @@ class SIPreEmitPeephole {
// Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
// V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
// this transformation.
- void performF32Unpacking(MachineInstr &I);
+ void performF32Unpacking(MachineInstr &I, const GCNSubtarget &ST);
// Select corresponding unpacked instruction
uint16_t mapToUnpackedOpcode(MachineInstr &I);
// Creates the unpacked instruction to be inserted. Adds source modifiers to
@@ -583,20 +584,33 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
void SIPreEmitPeephole::collectUnpackingCandidates(
MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
- uint16_t NumMFMACycles) {
+ uint16_t NumMFMACycles, const GCNSubtarget &ST) {
auto *BB = BeginMI.getParent();
auto E = BB->end();
int TotalCyclesBetweenCandidates = 0;
auto SchedModel = TII->getSchedModel();
+ const MCSchedModel *MCSchedMod = SchedModel.getMCSchedModel();
Register MFMADef = BeginMI.getOperand(0).getReg();
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
- uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
- bool IsUnpackable =
- !(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
if (Instr.isMetaInstruction())
continue;
+
+ SmallVector<unsigned, 4> UnpackSequence;
+ bool IsUnpackable = TII->getDowncastSequence(Instr, UnpackSequence, ST);
+
+    // We only support unpacking where the unpack sequence is all the same
+    // opcode. To support more complex sequences, we must teach
+    // performF32Unpacking how to handle them. The unpack sequence used in
+    // performF32Unpacking must agree with TII->getDowncastSequence, as that
+    // method is used for some scheduling decisions under the assumption
+    // that this will be the sequence used for unpacking.
+ IsUnpackable &=
+ all_of(UnpackSequence, [&UnpackSequence](unsigned CurrentOpcode) {
+ return CurrentOpcode == UnpackSequence[0];
+ });
+
if ((Instr.isTerminator()) ||
(TII->isNeverCoissue(Instr) && !IsUnpackable) ||
(SIInstrInfo::modifiesModeRegister(Instr) &&
@@ -631,18 +645,33 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
// latency, add latency of two unpacked instructions (currently estimated
// as 2 cycles).
TotalCyclesBetweenCandidates -= Latency;
- // TODO: improve latency handling based on instruction modeling.
- TotalCyclesBetweenCandidates += 2;
+
+ for (unsigned Opcode : UnpackSequence) {
+ unsigned SchedClass = TII->get(Opcode).getSchedClass();
+ const MCSchedClassDesc *SCDesc =
+ MCSchedMod->getSchedClassDesc(SchedClass);
+
+ // FIXME: We don't have an opcode based SchedClass resolution for variant
+ // SchedClass. This is a non-issue currently as none of the unpack
+ // instructions have variant SchedClasses.
+ assert(!SCDesc->isVariant());
+ uint16_t Latency =
+ SchedModel.getWriteProcResBegin(SCDesc)->ReleaseAtCycle;
+ TotalCyclesBetweenCandidates += Latency;
+ }
// Subtract 1 to account for MFMA issue latency.
if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
InstrsToUnpack.insert(&Instr);
}
}
-void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
+void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I,
+ const GCNSubtarget &ST) {
MachineOperand DstOp = I.getOperand(0);
- uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
+ SmallVector<unsigned, 4> UnpackSequence;
+ TII->getDowncastSequence(I, UnpackSequence, ST);
+ uint16_t UnpackedOpcode = UnpackSequence[0];
assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
"Unsupported Opcode");
@@ -786,10 +815,10 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
SchedModel.resolveSchedClass(&MI);
uint16_t NumMFMACycles =
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
- collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
+ collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles, ST);
}
for (MachineInstr *MI : InstrsToUnpack) {
- performF32Unpacking(*MI);
+ performF32Unpacking(*MI, ST);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir b/llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir
new file mode 100644
index 0000000000000..5f16e7ddfd090
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched.group.downcast.mir
@@ -0,0 +1,244 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s -check-prefixes=DEFAULT,GCN
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-igrouplp-use-downcast-ops=1 -run-pass=machine-scheduler -o - %s | FileCheck %s -check-prefixes=DOWNCAST,GCN
+
+
+# Only the default produces the prescribed pipeline: with amdgpu-igrouplp-use-downcast-ops, each V_PK counts as two VALU (8 VALU total), so a single V_PK fills each VALU group of 2.
+
+---
+name: 2xVALU_1xSALU_2xVALU_1xSALU
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; DEFAULT-LABEL: name: 2xVALU_1xSALU_2xVALU_1xSALU
+ ; DEFAULT: liveins: $vgpr0_vgpr1
+ ; DEFAULT-NEXT: {{ $}}
+ ; DEFAULT-NEXT: $exec = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF2]], 8, [[DEF3]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF3]], 8, [[V_PK_ADD_F32_]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DEFAULT-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc
+ ; DEFAULT-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_]], 8, [[V_PK_ADD_F32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DEFAULT-NEXT: dead [[V_PK_ADD_F32_3:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_1]], 8, [[V_PK_ADD_F32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DEFAULT-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc
+ ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; DEFAULT-NEXT: S_ENDPGM 0
+ ;
+ ; DOWNCAST-LABEL: name: 2xVALU_1xSALU_2xVALU_1xSALU
+ ; DOWNCAST: liveins: $vgpr0_vgpr1
+ ; DOWNCAST-NEXT: {{ $}}
+ ; DOWNCAST-NEXT: $exec = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: [[DEF3:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF2]], 8, [[DEF3]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DOWNCAST-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc
+ ; DOWNCAST-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF3]], 8, [[V_PK_ADD_F32_]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DOWNCAST-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_]], 8, [[V_PK_ADD_F32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DOWNCAST-NEXT: dead [[V_PK_ADD_F32_3:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_1]], 8, [[V_PK_ADD_F32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DOWNCAST-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc
+ ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; DOWNCAST-NEXT: S_ENDPGM 0
+ $exec = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sgpr_128 = IMPLICIT_DEF
+ %2:vreg_64_align2 = IMPLICIT_DEF
+ %3:vreg_64_align2 = IMPLICIT_DEF
+ %4:vreg_64_align2 = V_PK_ADD_F32 8, %2, 8, %3, 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %5:vreg_64_align2 = V_PK_ADD_F32 8, %3, 8, %4, 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %6:vreg_64_align2 = V_PK_ADD_F32 8, %4, 8, %5, 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %7:vreg_64_align2 = V_PK_ADD_F32 8, %5, 8, %6, 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %8:sgpr_32 = IMPLICIT_DEF
+ %9:sgpr_32 = IMPLICIT_DEF
+ %10:sgpr_32 = S_ADD_U32 %8, %9, implicit-def $scc
+ %11:sgpr_32 = S_ADD_U32 %9, %10, implicit-def $scc
+ SCHED_GROUP_BARRIER 2, 2, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ SCHED_GROUP_BARRIER 2, 2, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ S_ENDPGM 0
+...
+
+# amdgpu-igrouplp-use-downcast-ops should have no effect since the ops aren't candidates for downcast
+
+---
+name: 2xVALU_1xSALU_2xVALU_1xSALU_nonunpack
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GCN-LABEL: name: 2xVALU_1xSALU_2xVALU_1xSALU_nonunpack
+ ; GCN: liveins: $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $exec = IMPLICIT_DEF
+ ; GCN-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[DEF2]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[DEF3]], 0, [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc
+ ; GCN-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_ADD_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc
+ ; GCN-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; GCN-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; GCN-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; GCN-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; GCN-NEXT: S_ENDPGM 0
+ $exec = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sgpr_128 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = V_ADD_F32_e64 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
+ %5:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %4, 0, 0, implicit $mode, implicit $exec
+ %6:vgpr_32 = V_ADD_F32_e64 0, %4, 0, %5, 0, 0, implicit $mode, implicit $exec
+ %7:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
+ %8:sgpr_32 = IMPLICIT_DEF
+ %9:sgpr_32 = IMPLICIT_DEF
+ %10:sgpr_32 = S_ADD_U32 %8, %9, implicit-def $scc
+ %11:sgpr_32 = S_ADD_U32 %9, %10, implicit-def $scc
+ SCHED_GROUP_BARRIER 2, 2, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ SCHED_GROUP_BARRIER 2, 2, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ S_ENDPGM 0
+...
+
+# With amdgpu-igrouplp-use-downcast-ops, only 2 V_PKs are scheduled before each SALU (filling the VALU group of 4), since each V_PK will be unpacked into two VALU instructions.
+
+---
+name: 4xVALU_1xSALU_4xVALU_1xSALU
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; DEFAULT-LABEL: name: 4xVALU_1xSALU_4xVALU_1xSALU
+ ; DEFAULT: liveins: $vgpr0_vgpr1
+ ; DEFAULT-NEXT: {{ $}}
+ ; DEFAULT-NEXT: $exec = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF2]], 8, [[DEF3]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF3]], 8, [[V_PK_ADD_F32_]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DEFAULT-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_]], 8, [[V_PK_ADD_F32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DEFAULT-NEXT: dead [[V_PK_ADD_F32_3:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_1]], 8, [[V_PK_ADD_F32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DEFAULT-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc
+ ; DEFAULT-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc
+ ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+ ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+ ; DEFAULT-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; DEFAULT-NEXT: S_ENDPGM 0
+ ;
+ ; DOWNCAST-LABEL: name: 4xVALU_1xSALU_4xVALU_1xSALU
+ ; DOWNCAST: liveins: $vgpr0_vgpr1
+ ; DOWNCAST-NEXT: {{ $}}
+ ; DOWNCAST-NEXT: $exec = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: [[DEF2:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: [[DEF3:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF2]], 8, [[DEF3]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DOWNCAST-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; DOWNCAST-NEXT: [[V_PK_ADD_F32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[DEF3]], 8, [[V_PK_ADD_F32_]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DOWNCAST-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc
+ ; DOWNCAST-NEXT: [[V_PK_ADD_F32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_]], 8, [[V_PK_ADD_F32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DOWNCAST-NEXT: dead [[V_PK_ADD_F32_3:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_ADD_F32_1]], 8, [[V_PK_ADD_F32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DOWNCAST-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc
+ ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+ ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+ ; DOWNCAST-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; DOWNCAST-NEXT: S_ENDPGM 0
+ $exec = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sgpr_128 = IMPLICIT_DEF
+ %2:vreg_64_align2 = IMPLICIT_DEF
+ %3:vreg_64_align2 = IMPLICIT_DEF
+ %4:vreg_64_align2 = V_PK_ADD_F32 8, %2, 8, %3, 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %5:vreg_64_align2 = V_PK_ADD_F32 8, %3, 8, %4, 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %6:vreg_64_align2 = V_PK_ADD_F32 8, %4, 8, %5, 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %7:vreg_64_align2 = V_PK_ADD_F32 8, %5, 8, %6, 11, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %8:sgpr_32 = IMPLICIT_DEF
+ %9:sgpr_32 = IMPLICIT_DEF
+ %10:sgpr_32 = S_ADD_U32 %8, %9, implicit-def $scc
+ %11:sgpr_32 = S_ADD_U32 %9, %10, implicit-def $scc
+ SCHED_GROUP_BARRIER 2, 4, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ SCHED_GROUP_BARRIER 2, 4, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ S_ENDPGM 0
+...
+
+# amdgpu-igrouplp-use-downcast-ops should have no effect since the ops aren't candidates for downcast
+
+---
+name: 4xVALU_1xSALU_4xVALU_1xSALU_nonunpack
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; GCN-LABEL: name: 4xVALU_1xSALU_4xVALU_1xSALU_nonunpack
+ ; GCN: liveins: $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $exec = IMPLICIT_DEF
+ ; GCN-NEXT: dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: dead [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[DEF2]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF4:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[DEF3]], 0, [[V_ADD_F32_e64_]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: dead [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_ADD_F32_e64_2]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[S_ADD_U32_:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF4]], [[DEF5]], implicit-def $scc
+ ; GCN-NEXT: dead [[S_ADD_U32_1:%[0-9]+]]:sgpr_32 = S_ADD_U32 [[DEF5]], [[S_ADD_U32_]], implicit-def $scc
+ ; GCN-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+ ; GCN-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; GCN-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+ ; GCN-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; GCN-NEXT: S_ENDPGM 0
+ $exec = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sgpr_128 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = V_ADD_F32_e64 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
+ %5:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %4, 0, 0, implicit $mode, implicit $exec
+ %6:vgpr_32 = V_ADD_F32_e64 0, %4, 0, %5, 0, 0, implicit $mode, implicit $exec
+ %7:vgpr_32 = V_ADD_F32_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
+ %8:sgpr_32 = IMPLICIT_DEF
+ %9:sgpr_32 = IMPLICIT_DEF
+ %10:sgpr_32 = S_ADD_U32 %8, %9, implicit-def $scc
+ %11:sgpr_32 = S_ADD_U32 %9, %10, implicit-def $scc
+ SCHED_GROUP_BARRIER 2, 4, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ SCHED_GROUP_BARRIER 2, 4, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ S_ENDPGM 0
+...