[llvm] [AMDGPU] Add iglp_opt(2) to provide initial MFMA/Exp interleaving (PR #80370)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 6 04:05:18 PST 2024
================
@@ -904,6 +909,854 @@ void MFMASmallGemmOpt::applyIGLPStrategy(
}
}
+class MFMAExpInterleaveOpt final : public IGLPStrategy {
+private:
+ /// Whether or not the instruction is a transitive predecessor of an MFMA
+ /// instruction
+ class IsPipeExp final : public InstructionRule {
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+ auto DAG = SyncPipe[0].DAG;
+ auto TII = SyncPipe[0].TII;
+
+ if (Cache->empty()) {
+ auto I = DAG->SUnits.rbegin();
+ auto E = DAG->SUnits.rend();
+ for (; I != E; I++) {
+ if (TII->isMFMAorWMMA(*(I->getInstr())))
+ Cache->push_back(&*I);
+ }
+ }
+
+ if (Cache->empty())
+ return false;
+
+ auto Reaches = (std::any_of(
+ Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
+ return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
+ }));
+
+ return Reaches;
+ }
+ IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+ /// Whether or not the insturction is a transitive predecessor of the same
+ /// MFMA instruction as an instruction in a SchedGroup \p Number steps before
+ class ProduceSameMFMAWithPrevN final : public InstructionRule {
+ private:
+ unsigned Number = 1;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ SchedGroup *OtherGroup = nullptr;
+ for (auto &PipeSG : SyncPipe) {
+ if ((unsigned)PipeSG.getSGID() == SGID - Number) {
+ OtherGroup = &PipeSG;
+ }
+ }
+
+ if (!OtherGroup)
+ return false;
+ if (!OtherGroup->Collection.size())
+ return true;
+
+ auto DAG = SyncPipe[0].DAG;
+
+ if (Cache->empty()) {
+ auto TII = SyncPipe[0].TII;
+ SmallVector<SUnit *, 8> Worklist;
+
+ auto I = DAG->SUnits.rbegin();
+ auto E = DAG->SUnits.rend();
+ for (; I != E; I++)
+ if (TII->isMFMAorWMMA(*(I->getInstr())))
+ Worklist.push_back(&*I);
+
+ for (auto BaseSU : OtherGroup->Collection) {
+ if (!Cache->empty())
+ break;
+ for (auto CandSU : Worklist) {
+ if (DAG->IsReachable(CandSU, BaseSU)) {
+ Cache->push_back(CandSU);
+ break;
+ }
+ }
+ }
+ }
+ if (Cache->empty())
+ return false;
+
+ return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
+ }
+
+ ProduceSameMFMAWithPrevN(unsigned Number, const SIInstrInfo *TII,
+ unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+ };
+
+ /// Whether or not the instruction has less than \p Size immediate successors
+ class LessThanNSuccs final : public InstructionRule {
+ private:
+ unsigned Size = 1;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ if (!SyncPipe.size())
+ return false;
+
+ return SU->Succs.size() < Size;
+ }
+ LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Size(Size) {}
+ };
+
+ // Whether or not the instruction is an V_CVT instruction.
+ class IsCvt final : public InstructionRule {
+ private:
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ auto Opc = SU->getInstr()->getOpcode();
+ return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
+ Opc == AMDGPU::V_CVT_F16_F32_e32_gfx10 ||
+ Opc == AMDGPU::V_CVT_I32_F32_e32 ||
+ Opc == AMDGPU::V_CVT_I32_F32_e32_gfx10 ||
+ Opc == AMDGPU::V_CVT_I32_F32_e32_gfx11;
+ }
+ IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+ // Whether or not the instruction is an V_FMA_F32 instruction.
+ class IsFMA final : public InstructionRule {
+ private:
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64;
+ }
+ IsFMA(unsigned Val, const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+ /// Whether or not the instruction is an immediate RAW successor
+ /// of the SchedGroup \p Distance steps before.
+ class IsSuccOfPrevNthGroup final : public InstructionRule {
+ private:
+ unsigned Distance = 1;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ SchedGroup *OtherGroup = nullptr;
+ if (!SyncPipe.size())
+ return false;
+
+ for (auto &PipeSG : SyncPipe) {
+ if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
+ OtherGroup = &PipeSG;
+ }
+ }
+
+ if (!OtherGroup)
+ return false;
+ if (!OtherGroup->Collection.size())
+ return true;
+
+ for (auto &OtherEle : OtherGroup->Collection) {
+ for (auto &Succ : OtherEle->Succs) {
+ if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
+ return true;
+ }
+ }
+
+ return false;
+ }
+ IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+ unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+ };
+
+ /// Whether or not the instruction is a transitive successor of any
+ /// instruction the the SchedGroup \p Distance steps before.
+ class IsReachableFromPrevNthGroup final : public InstructionRule {
+ private:
+ unsigned Distance = 1;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ SchedGroup *OtherGroup = nullptr;
+ if (!SyncPipe.size())
+ return false;
+
+ for (auto &PipeSG : SyncPipe) {
+ if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
+ OtherGroup = &PipeSG;
+ }
+ }
+
+ if (!OtherGroup)
+ return false;
+ if (!OtherGroup->Collection.size())
+ return true;
+
+ auto DAG = SyncPipe[0].DAG;
+
+ for (auto &OtherEle : OtherGroup->Collection)
+ if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
+ return true;
+
+ return false;
+ }
+ IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+ unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+ };
+
+ /// Whether or not the instruction is the \p Number th occuring DS_READ
+ /// instruciton
+ class IsNthDSR final : public InstructionRule {
----------------
arsenm wrote:
DSR isn't clear, DS load?
https://github.com/llvm/llvm-project/pull/80370
More information about the llvm-commits
mailing list