[llvm] [AMDGPU] Add iglp_opt(2) to provide initial MFMA/Exp interleaving (PR #80370)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 8 14:25:33 PST 2024


================
@@ -904,6 +909,854 @@ void MFMASmallGemmOpt::applyIGLPStrategy(
   }
 }
 
+class MFMAExpInterleaveOpt final : public IGLPStrategy {
+private:
+  /// Whether or not the instruction is a transitive predecessor of an MFMA
+  /// instruction
+  class IsPipeExp final : public InstructionRule {
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+
+      auto DAG = SyncPipe[0].DAG;
+      auto TII = SyncPipe[0].TII;
+
+      if (Cache->empty()) {
+        auto I = DAG->SUnits.rbegin();
+        auto E = DAG->SUnits.rend();
+        for (; I != E; I++) {
+          if (TII->isMFMAorWMMA(*(I->getInstr())))
+            Cache->push_back(&*I);
+        }
+      }
+
+      if (Cache->empty())
+        return false;
+
+      auto Reaches = (std::any_of(
+          Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
+            return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
+          }));
+
+      return Reaches;
+    }
+    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  /// Whether or not the instruction is a transitive predecessor of the same
+  /// MFMA instruction as an instruction in a SchedGroup \p Number steps before
+  class ProduceSameMFMAWithPrevN final : public InstructionRule {
+  private:
+    unsigned Number = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      SchedGroup *OtherGroup = nullptr;
+      for (auto &PipeSG : SyncPipe) {
+        if ((unsigned)PipeSG.getSGID() == SGID - Number) {
+          OtherGroup = &PipeSG;
+        }
+      }
+
+      if (!OtherGroup)
+        return false;
+      if (!OtherGroup->Collection.size())
+        return true;
+
+      auto DAG = SyncPipe[0].DAG;
+
+      if (Cache->empty()) {
+        auto TII = SyncPipe[0].TII;
+        SmallVector<SUnit *, 8> Worklist;
+
+        auto I = DAG->SUnits.rbegin();
+        auto E = DAG->SUnits.rend();
+        for (; I != E; I++)
+          if (TII->isMFMAorWMMA(*(I->getInstr())))
+            Worklist.push_back(&*I);
+
+        for (auto BaseSU : OtherGroup->Collection) {
+          if (!Cache->empty())
+            break;
+          for (auto CandSU : Worklist) {
+            if (DAG->IsReachable(CandSU, BaseSU)) {
+              Cache->push_back(CandSU);
+              break;
+            }
+          }
+        }
+      }
+      if (Cache->empty())
+        return false;
+
+      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
+    }
+
+    ProduceSameMFMAWithPrevN(unsigned Number, const SIInstrInfo *TII,
+                             unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
+  };
+
+  /// Whether or not the instruction has fewer than \p Size immediate successors
+  class LessThanNSuccs final : public InstructionRule {
+  private:
+    unsigned Size = 1;
+
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      if (!SyncPipe.size())
+        return false;
+
+      return SU->Succs.size() < Size;
+    }
+    LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
+                   bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache), Size(Size) {}
+  };
+
+  // Whether or not the instruction is a V_CVT instruction.
+  class IsCvt final : public InstructionRule {
+  private:
+  public:
+    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+               SmallVectorImpl<SchedGroup> &SyncPipe) override {
+      auto Opc = SU->getInstr()->getOpcode();
+      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
+             Opc == AMDGPU::V_CVT_F16_F32_e32_gfx10 ||
+             Opc == AMDGPU::V_CVT_I32_F32_e32 ||
+             Opc == AMDGPU::V_CVT_I32_F32_e32_gfx10 ||
+             Opc == AMDGPU::V_CVT_I32_F32_e32_gfx11;
+    }
+    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+        : InstructionRule(TII, SGID, NeedsCache) {}
+  };
+
+  // Whether or not the instruction is a V_FMA_F32 instruction.
+  class IsFMA final : public InstructionRule {
----------------
jrbyrnes wrote:

The per item flow is:
	fma_f32 -> exp_f32 -> cvt_f16_f32 -> v_pack_b32_f16 -> mfma_.._f16

In some variants the pack is lowered into v_perm, and others may have some arithmetic between the exp and cvt, but this basic structure is universal from what I've seen. The main parameters (in terms of DAG structure) are the number of mfmas & exps, the mfma type, and how many exps feed into a single mfma.

Are you asking why we haven't instead produced fma_f16 -> exp_f16 -> v_pack? That would bypass the costly cvt.

As it is, though, by adding the fma into the pipeline derived from the mutation, we are able to interleave (which is the main function of pipelining). The main benefit, however, is actually reduced RP and better post-RA scheduling.
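
For reference, here's a minimal sketch (not from this patch) of where the builtin would sit in a kernel with this per-item flow. It assumes a HIP-style kernel and the pre-existing __builtin_amdgcn_iglp_opt builtin; the kernel name, shapes, and constants are made up, and the pack/mfma stages are elided since the real pattern operates on whole tiles:

    #include <hip/hip_runtime.h>

    // Hypothetical kernel illustrating the fma -> exp -> cvt chain that
    // feeds the MFMAs; names and shapes are for illustration only.
    __global__ void exp_mfma_kernel(const float *A, const float *B,
                                    _Float16 *Out, int N) {
      int Idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (Idx >= N)
        return;
      // Ask the scheduler for the MFMA/Exp interleaving strategy added here.
      __builtin_amdgcn_iglp_opt(2);
      float Acc = fmaf(A[Idx], B[Idx], 1.0f); // fma_f32
      float E = __expf(Acc);                  // exp_f32
      Out[Idx] = (_Float16)E;                 // cvt_f16_f32; v_pack + mfma elided
    }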

https://github.com/llvm/llvm-project/pull/80370

