[llvm] [RISCV] Add scheduling model for SiFive P800 processors (PR #139316)

Mon May 19 11:19:51 PDT 2025

================
@@ -0,0 +1,1185 @@
+//==- RISCVSchedSiFiveP800.td - SiFiveP800 Scheduling Defs ---*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+/// c is true if mx has the worst case behavior compared to LMULs in MxList.
+/// On the SiFiveP800, the worst case LMUL is the Largest LMUL
+/// and the worst case sew is the smallest SEW for that LMUL.
+class SiFiveP800IsWorstCaseMX<string mx, list<string> MxList> {
+  string LLMUL = LargestLMUL<MxList>.r;
+  bit c = !eq(mx, LLMUL);
+}
+
+class SiFiveP800IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
+  string LLMUL = LargestLMUL<MxList>.r;
+  int SSEW = SmallestSEW<mx, isF>.r;
+  bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
+}
+
+// 1 Micro-Op per cycle.
+class SiFiveP800GetLMulCycles<string mx> {
+  int c = !cond(
+    !eq(mx, "M1") : 1,
+    !eq(mx, "M2") : 2,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8,
+    !eq(mx, "MF2") : 1,
+    !eq(mx, "MF4") : 1,
+    !eq(mx, "MF8") : 1
+  );
+}
+
+// Latency for segmented loads and stores is calculated as vl.
+class SiFiveP800GetCyclesSegmented<string mx, int sew> {
+  defvar VLEN = 128;
+  int c = !cond(
+    !eq(mx, "M1") : !div(VLEN, sew),
+    !eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
+    !eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
+    !eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
+    !eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
+    !eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
+    !eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
+  );
+}
+
+class SiFiveP800VSM3CCycles<string mx> {
+  // c = ceil(LMUL / 2)
+  int c = !cond(!eq(mx, "M2") : 1,
+                !eq(mx, "M4") : 2,
+                !eq(mx, "M8") : 4,
+                true : 1);
+}
+
+// SiFiveP800 machine model for scheduling and other instruction cost heuristics.
+def SiFiveP800Model : SchedMachineModel {
+  let IssueWidth = 6;          // 6 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 288; // Max micro-ops that can be buffered.
+  let LoadLatency = 4;         // Cycles for loads to access the cache.
+  let MispredictPenalty = 9;   // Extra cycles for a mispredicted branch.
+  let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
+                             HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
+                             HasStdExtZksed, HasStdExtZksh, HasStdExtZkr,
+                             HasVendorXSfvqmaccqoq, HasVendorXSfvqmaccdod];
+  let CompleteModel = false;
+}
+
+let SchedModel = SiFiveP800Model in {
+
+def SiFiveP800IEXQ0       : ProcResource<1>;
+def SiFiveP800IEXQ1       : ProcResource<1>;
+def SiFiveP800IEXQ2       : ProcResource<1>;
+def SiFiveP800IEXQ3       : ProcResource<1>;
+def SiFiveP800IEXQ4       : ProcResource<1>;
+def SiFiveP800IEXQ5       : ProcResource<1>;
+def SiFiveP800FEXQ0       : ProcResource<1>;
+def SiFiveP800FEXQ1       : ProcResource<1>;
+
+// Two Load/Store ports that can issue either two loads, two stores, or one load
+// and one store.
+def SiFiveP800LDST       : ProcResource<2>;
+// One additional port that can only handle loads.
+def SiFiveP800LD         : ProcResource<1>;
+def SiFiveP800Load       : ProcResGroup<[SiFiveP800LDST, SiFiveP800LD]>;
+
+// 6-wide pipeline with 4 ALU pipes.
+def SiFiveP800IntArith    : ProcResGroup<[SiFiveP800IEXQ0, SiFiveP800IEXQ1, SiFiveP800IEXQ2, SiFiveP800IEXQ3]>;
+defvar SiFiveP800SYS      = SiFiveP800IEXQ1;
+defvar SiFiveP800CMOV     = SiFiveP800IEXQ3;
+defvar SiFiveP800I2F      = SiFiveP800IEXQ3;
+def SiFiveP800Mul         : ProcResGroup<[SiFiveP800IEXQ1, SiFiveP800IEXQ3]>;
+def SiFiveP800Branch      : ProcResGroup<[SiFiveP800IEXQ4, SiFiveP800IEXQ5]>;
+def SiFiveP800Div         : ProcResource<1>;
----------------
topperc wrote:

The divider blocks other divides for multiple cycles while it iterates, but only blocks non-divides for the issue cycle and writeback conflict. So we use an IEX resource for 1 cycle and a divider resource for multiple cycles to model this.

AArch64SchedCyclone.td does something similar, though they used 2 cycles for the issue and the writeback.

```
def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {                            
  let Latency = 10;                                                              
  let ReleaseAtCycles = [2, 10];                                                 
}
```

X86ScheduleZnver1.td also does something similar, but only used 1 cycle for ZnALU2.

```
// IDIV
defm : ZnWriteResPair<WriteDiv8,   [ZnALU2, ZnDivider], 15, [1,15], 1>;          
defm : ZnWriteResPair<WriteDiv16,  [ZnALU2, ZnDivider], 17, [1,17], 2>;          
defm : ZnWriteResPair<WriteDiv32,  [ZnALU2, ZnDivider], 25, [1,25], 2>;          
defm : ZnWriteResPair<WriteDiv64,  [ZnALU2, ZnDivider], 41, [1,41], 2>;          
defm : ZnWriteResPair<WriteIDiv8,  [ZnALU2, ZnDivider], 15, [1,15], 1>;          
defm : ZnWriteResPair<WriteIDiv16, [ZnALU2, ZnDivider], 17, [1,17], 2>;          
defm : ZnWriteResPair<WriteIDiv32, [ZnALU2, ZnDivider], 25, [1,25], 2>;          
defm : ZnWriteResPair<WriteIDiv64, [ZnALU2, ZnDivider], 41, [1,41], 2>;
```

https://github.com/llvm/llvm-project/pull/139316