[llvm] [RISCV] Add scheduling model for SiFive P800 processors (PR #139316)

Mon May 19 16:24:20 PDT 2025

================
@@ -0,0 +1,1185 @@
+//==- RISCVSchedSiFiveP800.td - SiFiveP800 Scheduling Defs ---*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+/// c is true if mx has the worst case behavior compared to LMULs in MxList.
+/// On the SiFiveP800, the worst case LMUL is the Largest LMUL
+/// and the worst case sew is the smallest SEW for that LMUL.
+class SiFiveP800IsWorstCaseMX<string mx, list<string> MxList> {
+  string LLMUL = LargestLMUL<MxList>.r;
+  bit c = !eq(mx, LLMUL);
+}
+
+class SiFiveP800IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
+  string LLMUL = LargestLMUL<MxList>.r;
+  int SSEW = SmallestSEW<mx, isF>.r;
+  bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
+
+// 1 Micro-Op per cycle.
+class SiFiveP800GetLMulCycles<string mx> {
+  int c = !cond(
+    !eq(mx, "M1") : 1,
+    !eq(mx, "M2") : 2,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8,
+    !eq(mx, "MF2") : 1,
+    !eq(mx, "MF4") : 1,
+    !eq(mx, "MF8") : 1
+  );
+}
+
+// Latency for segmented loads and stores are calculated as vl.
+class SiFiveP800GetCyclesSegmented<string mx, int sew> {
+  defvar VLEN = 128;
+  int c = !cond(
+    !eq(mx, "M1") : !div(VLEN, sew),
+    !eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
+    !eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
+    !eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
+    !eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
+    !eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
+    !eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
+  );
+}
+
+class SiFiveP800VSM3CCycles<string mx> {
+  // c = ceil(LMUL / 2)
+  int c = !cond(!eq(mx, "M2") : 1,
+                !eq(mx, "M4") : 2,
+                !eq(mx, "M8") : 4,
+                true : 1);
+}
+
+// SiFiveP800 machine model for scheduling and other instruction cost heuristics.
+def SiFiveP800Model : SchedMachineModel {
+  let IssueWidth = 6;          // 6 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 288; // Max micro-ops that can be buffered.
+  let LoadLatency = 4;         // Cycles for loads to access the cache.
+  let MispredictPenalty = 9;   // Extra cycles for a mispredicted branch.
+  let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
+                             HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
+                             HasStdExtZksed, HasStdExtZksh, HasStdExtZkr,
+                             HasVendorXSfvqmaccqoq, HasVendorXSfvqmaccdod];
+  let CompleteModel = false;
+}
+
+let SchedModel = SiFiveP800Model in {
+
+def SiFiveP800IEXQ0       : ProcResource<1>;
+def SiFiveP800IEXQ1       : ProcResource<1>;
+def SiFiveP800IEXQ2       : ProcResource<1>;
+def SiFiveP800IEXQ3       : ProcResource<1>;
+def SiFiveP800IEXQ4       : ProcResource<1>;
+def SiFiveP800IEXQ5       : ProcResource<1>;
+def SiFiveP800FEXQ0       : ProcResource<1>;
+def SiFiveP800FEXQ1       : ProcResource<1>;
+
+// Two Load/Store ports that can issue either two loads, two stores, or one load
+// and one store.
+def SiFiveP800LDST       : ProcResource<2>;
+// One additional port that can only handle loads.
+def SiFiveP800LD         : ProcResource<1>;
+def SiFiveP800Load       : ProcResGroup<[SiFiveP800LDST, SiFiveP800LD]>;
+
+// 6-wide pipeline with 6 ALU pipes.
+def SiFiveP800IntArith    : ProcResGroup<[SiFiveP800IEXQ0, SiFiveP800IEXQ1, SiFiveP800IEXQ2, SiFiveP800IEXQ3]>;
+defvar SiFiveP800SYS      = SiFiveP800IEXQ1;
+defvar SiFiveP800CMOV     = SiFiveP800IEXQ3;
+defvar SiFiveP800I2F      = SiFiveP800IEXQ3;
+def SiFiveP800Mul         : ProcResGroup<[SiFiveP800IEXQ1, SiFiveP800IEXQ3]>;
+def SiFiveP800Branch      : ProcResGroup<[SiFiveP800IEXQ4, SiFiveP800IEXQ5]>;
+def SiFiveP800Div         : ProcResource<1>;
+
+def SiFiveP800FloatArith  : ProcResGroup<[SiFiveP800FEXQ0, SiFiveP800FEXQ1]>;
+defvar SiFiveP800F2I      = SiFiveP800FEXQ0;
+def SiFiveP800FloatDiv    : ProcResource<1>;
+
+// Vector pipeline
+// VEXQ0 handle Mask, Simple Slide instructions,
+// VEXQ1 handle Complex Slide, Permutation, Reductions, Divide instructions.
+// Other vector instructions can be done in VEXQ0 and VEXQ1.
+def SiFiveP800VEXQ0        : ProcResource<1>;
+def SiFiveP800VEXQ1        : ProcResource<1>;
+def SiFiveP800VectorArith  : ProcResGroup<[SiFiveP800VEXQ0, SiFiveP800VEXQ1]>;
+
+def SiFiveP800VLD          : ProcResource<1>;
+def SiFiveP800VST          : ProcResource<1>;
----------------
mshockwave wrote:

if you're asking if we can merge VLD/VST with scalar LSU, vector memory ops in P800 don't share the exact same resources with their scalar counterparts, so it makes more sense to have separate issue ports here.

https://github.com/llvm/llvm-project/pull/139316