[llvm] [RISC-V] Base scheduling model for tt-ascalon-d8 (PR #120160)

Mon Dec 16 21:38:25 PST 2024

================
@@ -0,0 +1,333 @@
+//=- RISCVSchedTTAscalonD8.td - Tenstorrent Ascalon Scheduling Defs -----*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+
+// Machine-level parameters of the tt-ascalon-d8 scheduling model.
+def TTAscalonD8Model : SchedMachineModel {
+  // Decode and dispatch are 8 wide.
+  let IssueWidth = 8;
+
+  // The re-order buffer holds 256 micro-ops.
+  let MicroOpBufferSize = 256;
+
+  // Setting this enables partial & runtime unrolling.
+  let LoopMicroOpBufferSize = 16;
+
+  // Load latency; this is the optimistic figure.
+  let LoadLatency = 4;
+
+  // Penalty covers Fetch + Decode/Rename/Dispatch + Branch.
+  let MispredictPenalty = 14;
+
+  let CompleteModel = 0;
+
+  // TODO: The features below are supported, but scheduling info for them has
+  // not been added yet.
+  let UnsupportedFeatures = [
+    HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx, HasStdExtZcmt,
+    HasStdExtZknd, HasStdExtZkne, HasStdExtZknh, HasStdExtZksed,
+    HasStdExtZksh, HasStdExtZkr, HasVInstructions, HasVInstructionsI64
+  ];
+}
+
+let SchedModel = TTAscalonD8Model in {
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Every resource declared in this group gets BufferSize = 16. The
+// ProcResource argument is the number of units available for that resource;
+// the trailing comments on the FX resources list what each one handles.
+let BufferSize = 16 in {
+  def AscalonLS : ProcResource<3>;
+  def AscalonFXA : ProcResource<1>; // ALU, FP/VEC -> INT, MUL, DIV, CSR
+  def AscalonFXB : ProcResource<1>; // ALU, INT -> FP/VEC
+  def AscalonFXC : ProcResource<2>; // ALU, BR
+  def AscalonFXD : ProcResource<2>; // ALU
+  def AscalonFP : ProcResource<2>;
+  def AscalonV : ProcResource<2>;
+}
+
+// Group made up of all four AscalonFX* resources (FXA, FXB, FXC and FXD).
+def AscalonFX : ProcResGroup<[AscalonFXA, AscalonFXB, AscalonFXC, AscalonFXD]>;
+
+//===----------------------------------------------------------------------===//
+
+// Branching: the Jmp, Jal and Jalr writes all map to AscalonFXC.
+foreach jumpWrite = [WriteJmp, WriteJal, WriteJalr] in
+  def : WriteRes<jumpWrite, [AscalonFXC]>;
+
+// Integer arithmetic and logic: the ALU and shift writes (immediate and
+// register forms, plus their *32 counterparts) map to the AscalonFX group.
+foreach aluWrite = [WriteIALU32, WriteIALU,
+                    WriteShiftImm32, WriteShiftImm,
+                    WriteShiftReg32, WriteShiftReg] in
+  def : WriteRes<aluWrite, [AscalonFX]>;
+
+// Integer multiplication: both multiply writes map to AscalonFXA with
+// Latency = 3.
+foreach mulWrite = [WriteIMul, WriteIMul32] in
+  def : WriteRes<mulWrite, [AscalonFXA]> {
+    let Latency = 3;
+  }
+
+// Integer division and remainder: the worst-case latency is used. All four
+// writes map to AscalonFXA with Latency = 7 and ReleaseAtCycles = [7].
+foreach divWrite = [WriteIDiv32, WriteIDiv, WriteIRem32, WriteIRem] in
+  def : WriteRes<divWrite, [AscalonFXA]> {
+    let Latency = 7;
+    let ReleaseAtCycles = [7];
+  }
+
+// Bitmanip
+// The rotate, CLZ/CTZ and CPOP writes below are all mapped to the AscalonFX
+// group. None of them overrides Latency or ReleaseAtCycles, so they use
+// whatever defaults WriteRes provides.
+def : WriteRes<WriteRotateImm, [AscalonFX]>;
+def : WriteRes<WriteRotateImm32, [AscalonFX]>;
+def : WriteRes<WriteRotateReg, [AscalonFX]>;
+def : WriteRes<WriteRotateReg32, [AscalonFX]>;
+
+def : WriteRes<WriteCLZ, [AscalonFX]>;
+def : WriteRes<WriteCLZ32, [AscalonFX]>;
+def : WriteRes<WriteCTZ, [AscalonFX]>;
+def : WriteRes<WriteCTZ32, [AscalonFX]>;
+
+def : WriteRes<WriteCPOP, [AscalonFX]>;
----------------
ppenzin wrote:

Confirmed with our architecture team that it does indeed take one cycle in the current implementation.

https://github.com/llvm/llvm-project/pull/120160