[llvm] [RISCV] Add scheduling model for SiFive P800 processors (PR #139316)
Pengcheng Wang via llvm-commits
llvm-commits at lists.llvm.org
Sun May 11 21:17:38 PDT 2025
================
@@ -0,0 +1,1184 @@
+//==- RISCVSchedSiFiveP800.td - SiFiveP800 Scheduling Defs ---*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+/// c is true when mx is the worst-case LMUL among the LMULs in MxList.
+/// On the SiFiveP800, the worst-case LMUL is the largest LMUL in the list,
+/// and the worst-case SEW is the smallest SEW for that LMUL.
+class SiFiveP800IsWorstCaseMX<string mx, list<string> MxList> {
+ string LLMUL = LargestLMUL<MxList>.r; // Worst-case LMUL for MxList.
+ bit c = !eq(mx, LLMUL); // Set only when mx is that LMUL.
+}
+
+class SiFiveP800IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
+ string LLMUL = LargestLMUL<MxList>.r; // Worst-case LMUL for MxList.
+ int SSEW = SmallestSEW<mx, isF>.r; // Worst-case SEW for mx; isF is forwarded to SmallestSEW.
+ bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); // True only when both mx and sew are the worst case.
+}
+
+// Cycle count per LMUL at 1 micro-op per cycle: M1 and every fractional
+// LMUL take a single cycle, while M2/M4/M8 take 2/4/8 cycles.
+class SiFiveP800GetLMulCycles<string mx> {
+  int c = !cond(
+    !eq(mx, "M8") : 8,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M2") : 2,
+    !or(!eq(mx, "M1"), !eq(mx, "MF2"),
+        !eq(mx, "MF4"), !eq(mx, "MF8")) : 1
+  );
+}
+
+// Latency for segmented loads and stores is calculated as vl, i.e.
+// (VLEN * LMUL) / SEW with VLEN fixed at 128. The LMUL scaling is written
+// as a shift of VLEN: left for M2/M4/M8, right for MF2/MF4/MF8.
+class SiFiveP800GetCyclesSegmented<string mx, int sew> {
+  defvar VLEN = 128;
+  int c = !cond(
+    !eq(mx, "MF8") : !div(!srl(VLEN, 3), sew),
+    !eq(mx, "MF4") : !div(!srl(VLEN, 2), sew),
+    !eq(mx, "MF2") : !div(!srl(VLEN, 1), sew),
+    !eq(mx, "M1")  : !div(VLEN, sew),
+    !eq(mx, "M2")  : !div(!shl(VLEN, 1), sew),
+    !eq(mx, "M4")  : !div(!shl(VLEN, 2), sew),
+    !eq(mx, "M8")  : !div(!shl(VLEN, 3), sew)
+  );
+}
+
+class SiFiveP800VSM3CCycles<string mx> {
+  // c = ceil(LMUL / 2): only M4 and M8 need more than one cycle, every
+  // other LMUL (M2 included) resolves to 1.
+  int c = !if(!eq(mx, "M8"), 4,
+          !if(!eq(mx, "M4"), 2, 1));
+}
+
+// SiFiveP800 machine model for scheduling and other instruction cost heuristics.
+def SiFiveP800Model : SchedMachineModel {
+ let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
+ let MicroOpBufferSize = 288; // Max micro-ops that can be buffered.
+ let LoadLatency = 4; // Cycles for loads to access the cache.
+ let MispredictPenalty = 9; // Extra cycles for a mispredicted branch.
+ let PostRAScheduler = true; // Also run the scheduler after register allocation.
+ let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
+ HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
+ HasStdExtZksed, HasStdExtZksh, HasStdExtZkr,
+ HasVendorXSfvqmaccqoq, HasVendorXSfvqmaccdod];
+ let CompleteModel = false; // The model is not declared complete.
+}
+
+let SchedModel = SiFiveP800Model in {
+
+def SiFiveP800IEXQ0 : ProcResource<1>;
+def SiFiveP800IEXQ1 : ProcResource<1>;
+def SiFiveP800IEXQ2 : ProcResource<1>;
+def SiFiveP800IEXQ3 : ProcResource<1>;
+def SiFiveP800IEXQ4 : ProcResource<1>;
+def SiFiveP800IEXQ5 : ProcResource<1>;
+def SiFiveP800FEXQ0 : ProcResource<1>;
+def SiFiveP800FEXQ1 : ProcResource<1>;
+
+// Two Load/Store ports that can issue either two loads, two stores, or one load
+// and one store.
+def SiFiveP800LDST : ProcResource<2>;
+// One additional port that can only handle loads.
+def SiFiveP800LD : ProcResource<1>;
+def SiFiveP800Load : ProcResGroup<[SiFiveP800LDST, SiFiveP800LD]>; // Loads may use the LDST ports or the load-only port.
+
+// 6-wide pipeline: 4 ALU pipes (IEXQ0-IEXQ3) plus 2 branch pipes (IEXQ4-IEXQ5).
+def SiFiveP800IntArith : ProcResGroup<[SiFiveP800IEXQ0, SiFiveP800IEXQ1, SiFiveP800IEXQ2, SiFiveP800IEXQ3]>;
+defvar SiFiveP800SYS = SiFiveP800IEXQ1;
+defvar SiFiveP800CMOV = SiFiveP800IEXQ3;
+defvar SiFiveP800I2F = SiFiveP800IEXQ3;
+def SiFiveP800Mul : ProcResGroup<[SiFiveP800IEXQ1, SiFiveP800IEXQ3]>;
+def SiFiveP800Branch : ProcResGroup<[SiFiveP800IEXQ4, SiFiveP800IEXQ5]>;
+def SiFiveP800Div : ProcResource<1>; // Separate divider resource, listed next to IEXQ2 by the div/rem writes.
+
+def SiFiveP800FloatArith : ProcResGroup<[SiFiveP800FEXQ0, SiFiveP800FEXQ1]>;
+defvar SiFiveP800F2I = SiFiveP800FEXQ0;
+def SiFiveP800FloatDiv : ProcResource<1>; // Separate FP divider resource, listed next to FEXQ1 by the FP divide writes.
+
+// Vector pipeline
+// VEXQ0 handles Mask and Simple Slide instructions;
+// VEXQ1 handles Complex Slide, Permutation, Reduction and Divide instructions.
+// Other vector instructions can execute on either VEXQ0 or VEXQ1.
+def SiFiveP800VEXQ0 : ProcResource<1>;
+def SiFiveP800VEXQ1 : ProcResource<1>;
+def SiFiveP800VectorArith : ProcResGroup<[SiFiveP800VEXQ0, SiFiveP800VEXQ1]>;
+
+def SiFiveP800VLD : ProcResource<1>;
+def SiFiveP800VST : ProcResource<1>;
+def SiFiveP800VDiv : ProcResource<1>;
+def SiFiveP800VFloatDiv : ProcResource<1>;
+
+// Integer arithmetic and logic: may issue on any pipe in SiFiveP800IntArith.
+def : WriteRes<WriteIALU, [SiFiveP800IntArith]>;
+def : WriteRes<WriteIALU32, [SiFiveP800IntArith]>;
+def : WriteRes<WriteShiftImm, [SiFiveP800IntArith]>;
+def : WriteRes<WriteShiftImm32, [SiFiveP800IntArith]>;
+def : WriteRes<WriteShiftReg, [SiFiveP800IntArith]>;
+def : WriteRes<WriteShiftReg32, [SiFiveP800IntArith]>;
+// Branching: jumps issue on the SiFiveP800Branch group.
+def : WriteRes<WriteJmp, [SiFiveP800Branch]>;
+def : WriteRes<WriteJal, [SiFiveP800Branch]>;
+def : WriteRes<WriteJalr, [SiFiveP800Branch]>;
+
+// CMOV: PseudoCCMOVGPRNoX0 is modeled as 2 micro-ops with latency 2, using
+def P800WriteCMOV : SchedWriteRes<[SiFiveP800Branch, SiFiveP800CMOV]> { // a branch pipe plus the CMOV pipe.
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def : InstRW<[P800WriteCMOV], (instrs PseudoCCMOVGPRNoX0)>;
+
+let Latency = 2 in { // Applies to every WriteRes in this scope.
+// Integer multiplication
+def : WriteRes<WriteIMul, [SiFiveP800Mul]>;
+def : WriteRes<WriteIMul32, [SiFiveP800Mul]>;
+// cpop[w] is modeled exactly like multiply: same pipes, same latency.
+def : WriteRes<WriteCPOP, [SiFiveP800Mul]>;
+def : WriteRes<WriteCPOP32, [SiFiveP800Mul]>;
+}
+
+// Integer division: uses IEXQ2 together with the SiFiveP800Div resource.
+def : WriteRes<WriteIDiv, [SiFiveP800IEXQ2, SiFiveP800Div]> {
+ let Latency = 35;
+ let ReleaseAtCycles = [1, 34]; // One entry per resource, in list order.
+}
+def : WriteRes<WriteIDiv32, [SiFiveP800IEXQ2, SiFiveP800Div]> {
+ let Latency = 20;
+ let ReleaseAtCycles = [1, 19];
+}
+
+// Integer remainder: same resources and timings as the matching division.
+def : WriteRes<WriteIRem, [SiFiveP800IEXQ2, SiFiveP800Div]> {
+ let Latency = 35;
+ let ReleaseAtCycles = [1, 34];
+}
+def : WriteRes<WriteIRem32, [SiFiveP800IEXQ2, SiFiveP800Div]> {
+ let Latency = 20;
+ let ReleaseAtCycles = [1, 19];
+}
+
+// Bitmanip: every write below uses SiFiveP800IntArith with no Latency override.
+def : WriteRes<WriteRotateImm, [SiFiveP800IntArith]>;
+def : WriteRes<WriteRotateImm32, [SiFiveP800IntArith]>;
+def : WriteRes<WriteRotateReg, [SiFiveP800IntArith]>;
+def : WriteRes<WriteRotateReg32, [SiFiveP800IntArith]>;
+
+def : WriteRes<WriteCLZ, [SiFiveP800IntArith]>;
+def : WriteRes<WriteCLZ32, [SiFiveP800IntArith]>;
+def : WriteRes<WriteCTZ, [SiFiveP800IntArith]>;
+def : WriteRes<WriteCTZ32, [SiFiveP800IntArith]>;
+
+def : WriteRes<WriteORCB, [SiFiveP800IntArith]>;
+def : WriteRes<WriteIMinMax, [SiFiveP800IntArith]>;
+
+def : WriteRes<WriteREV8, [SiFiveP800IntArith]>;
+
+def : WriteRes<WriteSHXADD, [SiFiveP800IntArith]>;
+def : WriteRes<WriteSHXADD32, [SiFiveP800IntArith]>;
+
+def : WriteRes<WriteSingleBit, [SiFiveP800IntArith]>;
+def : WriteRes<WriteSingleBitImm, [SiFiveP800IntArith]>;
+def : WriteRes<WriteBEXT, [SiFiveP800IntArith]>;
+def : WriteRes<WriteBEXTI, [SiFiveP800IntArith]>;
+
+// Memory
+// Integer and FP stores all go through the shared load/store ports.
+foreach st = [WriteSTB, WriteSTH, WriteSTW, WriteSTD,
+              WriteFST16, WriteFST32, WriteFST64] in
+def : WriteRes<st, [SiFiveP800LDST]>;
+
+// Integer loads: 4 cycles on any load-capable port.
+let Latency = 4 in
+foreach ld = [WriteLDB, WriteLDH, WriteLDW, WriteLDD] in
+def : WriteRes<ld, [SiFiveP800Load]>;
+
+// FP loads: 5 cycles on any load-capable port.
+let Latency = 5 in
+foreach ld = [WriteFLD16, WriteFLD32, WriteFLD64] in
+def : WriteRes<ld, [SiFiveP800Load]>;
+
+// Atomic memory: atomic loads may use any load port; all other atomics use the LDST ports.
+let Latency = 3 in {
+def : WriteRes<WriteAtomicSTW, [SiFiveP800LDST]>;
+def : WriteRes<WriteAtomicSTD, [SiFiveP800LDST]>;
+def : WriteRes<WriteAtomicW, [SiFiveP800LDST]>;
+def : WriteRes<WriteAtomicD, [SiFiveP800LDST]>;
+def : WriteRes<WriteAtomicLDW, [SiFiveP800Load]>;
+def : WriteRes<WriteAtomicLDD, [SiFiveP800Load]>;
+}
+
+// Floating point: every write in this section uses SiFiveP800FloatArith.
+let Latency = 2 in { // Add.
+def : WriteRes<WriteFAdd16, [SiFiveP800FloatArith]>;
+def : WriteRes<WriteFAdd32, [SiFiveP800FloatArith]>;
+def : WriteRes<WriteFAdd64, [SiFiveP800FloatArith]>;
+}
+let Latency = 3 in { // Multiply.
+def : WriteRes<WriteFMul16, [SiFiveP800FloatArith]>;
+def : WriteRes<WriteFMul32, [SiFiveP800FloatArith]>;
+def : WriteRes<WriteFMul64, [SiFiveP800FloatArith]>;
+}
+let Latency = 4 in { // Fused multiply-add.
+def : WriteRes<WriteFMA16, [SiFiveP800FloatArith]>;
+def : WriteRes<WriteFMA32, [SiFiveP800FloatArith]>;
+def : WriteRes<WriteFMA64, [SiFiveP800FloatArith]>;
+}
+
+let Latency = 2 in { // Sign injection and min/max.
+def : WriteRes<WriteFSGNJ16, [SiFiveP800FloatArith]>;
+def : WriteRes<WriteFSGNJ32, [SiFiveP800FloatArith]>;
+def : WriteRes<WriteFSGNJ64, [SiFiveP800FloatArith]>;
+
+def : WriteRes<WriteFMinMax16, [SiFiveP800FloatArith]>;
+def : WriteRes<WriteFMinMax32, [SiFiveP800FloatArith]>;
+def : WriteRes<WriteFMinMax64, [SiFiveP800FloatArith]>;
+}
+
+// Half precision: divide uses FEXQ1 together with the SiFiveP800FloatDiv resource.
+def : WriteRes<WriteFDiv16, [SiFiveP800FEXQ1, SiFiveP800FloatDiv]> {
+ let Latency = 4;
+ let ReleaseAtCycles = [1, 4]; // One entry per resource, in list order.
+}
+def : WriteRes<WriteFSqrt16, [SiFiveP800FEXQ1, SiFiveP800FloatDiv]> {
+ let Latency = 18;
----------------
wangpc-pp wrote:
Half is the same as Single?
https://github.com/llvm/llvm-project/pull/139316
More information about the llvm-commits
mailing list