[llvm] [RISCV] Add scheduler definitions for SpacemiT-X60 (PR #137343)

Mon May 5 09:11:07 PDT 2025

================
@@ -0,0 +1,353 @@
+//=- RISCVSchedSpacemitX60.td - Spacemit X60 Scheduling Defs -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Scheduler model for the SpacemiT-X60 processor based on documentation of the
+// C908 and experiments on real hardware (bpi-f3).
+//
+//===----------------------------------------------------------------------===//
+
+def SpacemitX60Model : SchedMachineModel {
+  let IssueWidth        = 2; // dual-issue
+  let MicroOpBufferSize = 0; // in-order
+  let LoadLatency       = 5; // worst case: >= 3
+  let MispredictPenalty = 9; // nine-stage
+
+  let CompleteModel = 0;
+
+  let UnsupportedFeatures = [HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
+                             HasStdExtZksed, HasStdExtZksh, HasStdExtZkr];
+}
+
+let SchedModel = SpacemitX60Model in {
+
+//===----------------------------------------------------------------------===//
+// Define processor resources for SpacemiT-X60
+
+// Information gathered from the C908 user manual:
+let BufferSize = 0 in {
+  // The LSU supports dual issue for scalar store/load instructions
+  def SMX60_LS : ProcResource<2>;
+
+  // An IEU can decode and issue two instructions at the same time
+  def SMX60_IEUA : ProcResource<1>;
+  def SMX60_IEUB : ProcResource<1>;
+  def SMX60_IEU : ProcResGroup<[SMX60_IEUA, SMX60_IEUB]>;
+
+  // Although the X60 does appear to support multiple issue for at least some
+  // floating-point instructions, this model assumes single issue, as
+  // increasing the issue width reduces the performance gains we saw.
+  def SMX60_FP : ProcResource<1>;
+}
+
+//===----------------------------------------------------------------------===//
+
+// Branching
+def : WriteRes<WriteJmp, [SMX60_IEUA]>;
+def : WriteRes<WriteJal, [SMX60_IEUA]>;
+def : WriteRes<WriteJalr, [SMX60_IEUA]>;
+
+// Integer arithmetic and logic
+// Latency of ALU instructions is 1, except for add.uw, which is 2
+def : WriteRes<WriteIALU32, [SMX60_IEU]>;
+def : WriteRes<WriteIALU, [SMX60_IEU]>;
+def : WriteRes<WriteShiftImm32, [SMX60_IEU]>;
+def : WriteRes<WriteShiftImm, [SMX60_IEU]>;
+def : WriteRes<WriteShiftReg32, [SMX60_IEU]>;
+def : WriteRes<WriteShiftReg, [SMX60_IEU]>;
+
+// Integer multiplication
+def : WriteRes<WriteIMul32, [SMX60_IEU]>  { let Latency = 3; }
+
+// The latency of mul is 5, while that of mulh, mulhsu, and mulhu is 6.
+// The worst-case latency is used.
+def : WriteRes<WriteIMul, [SMX60_IEU]>  { let Latency = 6; }
+
+// Integer division/remainder
+// TODO: Latency is based on the C908 datasheet and hasn't been
+// confirmed experimentally.
+let Latency = 12, ReleaseAtCycles = [12] in {
+  def : WriteRes<WriteIDiv32, [SMX60_IEUA]>;
+  def : WriteRes<WriteIRem32, [SMX60_IEUA]>;
+}
+let Latency = 20, ReleaseAtCycles = [20] in {
+  def : WriteRes<WriteIDiv, [SMX60_IEUA]>;
+  def : WriteRes<WriteIRem, [SMX60_IEUA]>;
+}
+
+// Bitmanip
+def : WriteRes<WriteRotateImm, [SMX60_IEU]>;
+def : WriteRes<WriteRotateImm32, [SMX60_IEU]>;
+def : WriteRes<WriteRotateReg, [SMX60_IEU]>;
+def : WriteRes<WriteRotateReg32, [SMX60_IEU]>;
+
+def : WriteRes<WriteCLZ, [SMX60_IEU]>;
+def : WriteRes<WriteCLZ32, [SMX60_IEU]>;
+def : WriteRes<WriteCTZ, [SMX60_IEU]>;
+def : WriteRes<WriteCTZ32, [SMX60_IEU]>;
+
+let Latency = 2 in {
+  def : WriteRes<WriteCPOP, [SMX60_IEU]>;
+  def : WriteRes<WriteCPOP32, [SMX60_IEU]>;
+}
+
+def : WriteRes<WriteORCB, [SMX60_IEU]>;
+def : WriteRes<WriteIMinMax, [SMX60_IEU]>;
+def : WriteRes<WriteREV8, [SMX60_IEU]>;
+
+let Latency = 2 in {
----------------
preames wrote:

As a follow-up (as in, *not* in this patch), it would be interesting to explore whether this is actually a two-cycle latency, or whether it is micro-coded as two uops, each with a latency of one. You could maybe see this in perf counters.

https://github.com/llvm/llvm-project/pull/137343