[llvm] 208fc34 - [RISCV] Improve SiFive7 for reductions and ordered reductions

Thu Jun 22 10:16:14 PDT 2023

Author: Michael Maitland
Date: 2023-06-22T10:16:04-07:00
New Revision: 208fc34c65d648e869d7d3ba0dfcbca90942cda0

URL: https://github.com/llvm/llvm-project/commit/208fc34c65d648e869d7d3ba0dfcbca90942cda0
DIFF: https://github.com/llvm/llvm-project/commit/208fc34c65d648e869d7d3ba0dfcbca90942cda0.diff

LOG: [RISCV] Improve SiFive7 for reductions and ordered reductions

Since the scheduling resources for reductions and ordered reductions now
account for LMUL and SEW, we can modify the Latency and ResourceCycles
for these resources.

* Most reductions take a total of approx `vl*SEW/DLEN + 5*(4 + log2(DLEN/SEW))`
  cycles.
* Ordered floating-point reductions take a total of approx `5*vl` cycles.

Differential Revision: https://reviews.llvm.org/D153474

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVSchedSiFive7.td

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 78a1e8464fa90..ee8548639ad88 100644

--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -160,6 +160,44 @@ class SiFive7GetDivOrSqrtFactor<int sew> {
   );
 }
 
+/// Reductions take approximately VL*SEW/DLEN + 5*(4 + log2(DLEN/SEW)) cycles.
+/// The result for LMUL `mx` and element width `sew` is exposed as field `c`.
+class SiFive7GetReductionCycles<string mx, int sew> {
+  // With VL at its upper bound, (VLEN*LMUL)/SEW, the VL*SEW/DLEN term equals
+  // 2*LMUL because DLEN = VLEN/2. MF4 and MF8 are clamped to a minimum of 1.
+  defvar VLEN = 512;
+  defvar DLEN = !div(VLEN, 2);
+  defvar TwoTimesLMUL = !cond(
+    !eq(mx, "M1") : 2,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16,
+    !eq(mx, "MF2") : 1,
+    !eq(mx, "MF4") : 1,
+    !eq(mx, "MF8") : 1
+  );
+  int c = !add(
+    TwoTimesLMUL, // Already VL*SEW/DLEN; dividing by DLEN again gives 0.
+    !mul(5, !add(4, !logtwo(!div(DLEN, sew))))
+  );
+}
+
+/// Ordered reductions are modelled as taking approximately 5*VL cycles.
+class SiFive7GetOrderedReductionCycles<string mx, int sew> {
+  defvar VLEN = 512;
+  // Largest VL for this LMUL and SEW: (VLEN * LMUL) / SEW.
+  defvar MaxVL = !cond(
+    !eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
+    !eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
+    !eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
+    !eq(mx, "M1") : !div(VLEN, sew),
+    !eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
+    !eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
+    !eq(mx, "M8") : !div(!mul(VLEN, 8), sew)
+  );
+  int c = !mul(MaxVL, 5);
+}
+
 // SiFive7 machine model for scheduling and other instruction cost heuristics.
 def SiFive7Model : SchedMachineModel {
   let MicroOpBufferSize = 0; // Explicitly set to zero since SiFive7 is in-order.
@@ -730,14 +768,55 @@ foreach mx = SchedMxListFW in {
 }
 
 // 14. Vector Reduction Operations
-let Latency = 32 in {
-defm "" : LMULSEWWriteRes<"WriteVIRedV_From", [SiFive7VA]>;
-defm "" : LMULSEWWriteRes<"WriteVIWRedV_From", [SiFive7VA]>;
-defm "" : LMULSEWWriteRes<"WriteVFRedV_From", [SiFive7VA]>;
-defm "" : LMULSEWWriteRes<"WriteVFRedOV_From", [SiFive7VA]>;
-defm "" : LMULSEWWriteResF<"WriteVFRedMinMaxV_From", [SiFive7VA]>;
-defm "" : LMULSEWWriteResFWRed<"WriteVFWRedV_From", [SiFive7VA]>;
-defm "" : LMULSEWWriteResFWRed<"WriteVFWRedOV_From", [SiFive7VA]>;
+foreach mx = SchedMxList in { // WriteVIRedV_From for each (mx, sew) pair.
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar Cycles = SiFive7GetReductionCycles<mx, sew>.c; // Modelled cycles.
+    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+    let Latency = Cycles, ResourceCycles = [Cycles] in // Same value for both.
+    defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SiFive7VA],
+                                   mx, sew, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxListWRed in { // WriteVIWRedV_From for each (mx, sew) pair.
+  foreach sew = SchedSEWSet<mx, 1>.val in {
+    defvar Cycles = SiFive7GetReductionCycles<mx, sew>.c; // Modelled cycles.
+    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
+    let Latency = Cycles, ResourceCycles = [Cycles] in // Same value for both.
+    defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SiFive7VA],
+                                   mx, sew, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxListF in { // Three write resources per (mx, sew) pair.
+  foreach sew = SchedSEWSetF<mx>.val in {
+    defvar RedCycles = SiFive7GetReductionCycles<mx, sew>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+    let Latency = RedCycles, ResourceCycles = [RedCycles] in { // Shared cost.
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SiFive7VA],
+                                     mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SiFive7VA],
+                                     mx, sew, IsWorstCase>;
+    }
+    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew>.c;
+    let Latency = OrdRedCycles, ResourceCycles = [OrdRedCycles] in // ~5*VL.
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SiFive7VA],
+                                   mx, sew, IsWorstCase>;
+  }
+}
+
+foreach mx = SchedMxListFWRed in { // Two write resources per (mx, sew) pair.
+  foreach sew = SchedSEWSetF<mx, 1>.val in {
+    defvar RedCycles = SiFive7GetReductionCycles<mx, sew>.c;
+    defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
+    let Latency = RedCycles, ResourceCycles = [RedCycles] in // Same for both.
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SiFive7VA],
+                                   mx, sew, IsWorstCase>;
+    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew>.c;
+    let Latency = OrdRedCycles, ResourceCycles = [OrdRedCycles] in // ~5*VL.
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SiFive7VA],
+                                   mx, sew, IsWorstCase>;
+  }
 }
 
 // 15. Vector Mask Instructions