[llvm] [RISCV] Add sched model for XiangShan-NanHu (PR #70232)

Yinan Xu via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 16 19:12:53 PST 2023


================
@@ -0,0 +1,308 @@
+//==- RISCVSchedXiangShanNanHu.td - XiangShan-NanHu Scheduling Definitions --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------------------------------===//
+
+//===-------------------------------------------------------------------------------------===//
+
+// XiangShan is a high-performance open-source RISC-V processor developed by
+// the Institute of Computing Technology (ICT), Chinese Academy of Sciences.
+// Source: https://github.com/OpenXiangShan/XiangShan
+// Documentation: https://github.com/OpenXiangShan/XiangShan-doc
+
+// XiangShan-NanHu is the second generation of XiangShan processor series.
+// Overview: https://xiangshan-doc.readthedocs.io/zh-cn/latest/integration/overview/
+
+def XiangShanNanHuModel : SchedMachineModel {
+  let MicroOpBufferSize = 256;
+  let LoopMicroOpBufferSize = 48;  // Instruction queue size
+  let IssueWidth = 6;  // 6-way decode and dispatch
+  let LoadLatency = 4;
+  let MispredictPenalty = 11; // Based on estimate of pipeline depth.
+  let CompleteModel = 0;
+  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
+  let UnsupportedFeatures = [HasStdExtZcmt, HasStdExtZkr, HasVInstructions,
+                             HasVInstructionsI64];
+}
+
+let SchedModel = XiangShanNanHuModel in {
+
+// The reservation stations are distributed and split into smaller groups of 32 or 16 entries.
+let BufferSize = 16 in {
+  def XS2ALU : ProcResource<4>;
+  def XS2MDU : ProcResource<2>;
+  def XS2MISC : ProcResource<1>;
+
+  def XS2FMAC : ProcResource<4>;
+  def XS2FMISC : ProcResource<2>;
+
+  // Load/Store queues are ignored.
+  def XS2LD : ProcResource<2>;
+  def XS2ST : ProcResource<2>;
+}
+
+// Branching
+def : WriteRes<WriteJmp, [XS2MISC]>;
+def : WriteRes<WriteJal, [XS2MISC]>;
+def : WriteRes<WriteJalr, [XS2MISC]>;
+
+// Integer arithmetic and logic
+let Latency = 1 in {
+def : WriteRes<WriteIALU, [XS2ALU]>;
+def : WriteRes<WriteIALU32, [XS2ALU]>;
+def : WriteRes<WriteShiftImm, [XS2ALU]>;
+def : WriteRes<WriteShiftImm32, [XS2ALU]>;
+def : WriteRes<WriteShiftReg, [XS2ALU]>;
+def : WriteRes<WriteShiftReg32, [XS2ALU]>;
+}
+
+// Integer multiplication
+let Latency = 3 in {
+def : WriteRes<WriteIMul, [XS2MDU]>;
+def : WriteRes<WriteIMul32, [XS2MDU]>;
+}
+
+// Integer division
+// SRT16 algorithm
+let Latency = 20, ReleaseAtCycles = [20] in {
+def : WriteRes<WriteIDiv32, [XS2MDU]>;
+def : WriteRes<WriteIDiv, [XS2MDU]>;
+}
+
+// Zb*
+let Latency = 1 in {
+// Zba
+def : WriteRes<WriteSHXADD, [XS2ALU]>;
+def : WriteRes<WriteSHXADD32, [XS2ALU]>;
+
+// Zbb
+def : WriteRes<WriteRotateImm, [XS2ALU]>;
+def : WriteRes<WriteRotateImm32, [XS2ALU]>;
+def : WriteRes<WriteRotateReg, [XS2ALU]>;
+def : WriteRes<WriteRotateReg32, [XS2ALU]>;
+def : WriteRes<WriteORCB, [XS2ALU]>;
+def : WriteRes<WriteREV8, [XS2ALU]>;
+
+// Zbkb
+def : WriteRes<WriteBREV8, [XS2ALU]>;
+def : WriteRes<WritePACK, [XS2ALU]>;
+def : WriteRes<WritePACK32, [XS2ALU]>;
+def : WriteRes<WriteZIP, [XS2ALU]>;
+}
+
+let Latency = 3 in {
+// Zbb
+def : WriteRes<WriteCLZ, [XS2MDU]>;
+def : WriteRes<WriteCLZ32, [XS2MDU]>;
+def : WriteRes<WriteCTZ, [XS2MDU]>;
+def : WriteRes<WriteCTZ32, [XS2MDU]>;
+def : WriteRes<WriteCPOP, [XS2MDU]>;
+def : WriteRes<WriteCPOP32, [XS2MDU]>;
+
+// Zbs
+def : WriteRes<WriteSingleBit, [XS2MDU]>;
----------------
poemonsense wrote:

Yes, I agree their logic is simple. I think the main reason we implemented them with higher latencies is ALU resource pressure (weighed against how rarely these instructions are used). The current ALU design incorporates too many operations, which hurts timing. KMH could reconsider its detailed design to handle these less frequently used instructions. I'm not sure about KMH's schedule, but perhaps @huxuan0307 can check whether KMH can optimize these instructions in the future.
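For illustration only: if a future KMH revision did move these operations onto the ALU, the scheduling entries might look roughly like the sketch below. This is purely hypothetical (it assumes the count instructions become single-cycle ALU ops, which no current design guarantees):

    // Hypothetical alternative: count instructions handled by the ALU at 1-cycle latency.
    let Latency = 1 in {
    def : WriteRes<WriteCLZ, [XS2ALU]>;
    def : WriteRes<WriteCLZ32, [XS2ALU]>;
    def : WriteRes<WriteCTZ, [XS2ALU]>;
    def : WriteRes<WriteCTZ32, [XS2ALU]>;
    def : WriteRes<WriteCPOP, [XS2ALU]>;
    def : WriteRes<WriteCPOP32, [XS2ALU]>;
    }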

https://github.com/llvm/llvm-project/pull/70232

