[llvm] [AArch64] Initial Olympus scheduling model. (PR #171607)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 18 08:42:39 PST 2025
================
@@ -0,0 +1,3377 @@
+//=- AArch64SchedOlympus.td - Olympus Scheduling Defs --------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for the NVIDIA Olympus processors.
+// All information is taken from the Olympus Software Optimisation guide:
+//
+// https://docs.nvidia.com/olympus-cpu-core-software-optimization-guide-dp12531-001v0-7.pdf
+//
+//===----------------------------------------------------------------------===//
+
+def OlympusModel : SchedMachineModel {
+ // NOTE: Unless otherwise stated, values unspecified in the Olympus SWOG are
+ // copied from the Neoverse V2 model.
+ let IssueWidth = 10; // Maximum macro-ops dispatched per cycle.
+ let MicroOpBufferSize = 320; // Entries in micro-op re-order buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 10; // Extra cycles for mispredicted branch.
+ let LoopMicroOpBufferSize = 16; // NOTE: Copied from Cortex-A57.
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
+ [HasSVE2p1, HasSVEB16B16,
+ HasCPA, HasCSSC,
+ HasMatMulFP64]);
+}
+
+//===----------------------------------------------------------------------===//
+// In the Olympus core, instructions are first fetched and decoded into
+// internal macro-ops (MOps). Those MOps are then renamed and dispatched to the
+// out-of-order portion of the core. A MOp can be split into two micro-ops
+// (uOps) further down the pipeline after instruction decode. Once dispatched,
+// a uOp waits for its operands to become available and issues out-of-order to
+// one of many execution pipelines. Each execution pipeline can accept one uOp
+// per cycle.
+
+let SchedModel = OlympusModel in {
+
+// Note: The RCU is not specified in the SWOG, therefore we assume we can commit
+// as many MOps as we can dispatch each cycle.
+def OlympusRCU : RetireControlUnit<OlympusModel.MicroOpBufferSize, 10>;
+
+// Define the issue ports.
+def OlympusUnitB : ProcResource<4>; // Branch 0/1/2/3
+def OlympusUnitS : ProcResource<6>; // Integer single-cycle 0/1/2/3/4/5
+def OlympusUnitM0 : ProcResource<1>; // Integer single/multicycle 0
+def OlympusUnitM1 : ProcResource<1>; // Integer single/multicycle 1
+def OlympusUnitL : ProcResource<4>; // Load 0/1/2/3
+def OlympusUnitSA : ProcResource<2>; // Store 0/1
+def OlympusUnitD : ProcResource<2>; // Integer store data 0/1
+def OlympusUnitV0 : ProcResource<1>; // FP/ASIMD 0
+def OlympusUnitV1 : ProcResource<1>; // FP/ASIMD 1
+def OlympusUnitV2 : ProcResource<1>; // FP/ASIMD 2
+def OlympusUnitV3 : ProcResource<1>; // FP/ASIMD 3
+def OlympusUnitV45 : ProcResource<2>; // FP/ASIMD 4/5
+def OlympusUnitF : ProcResource<6>; // Flags
+
+def OlympusUnitM : ProcResGroup<[OlympusUnitM0, OlympusUnitM1]>; // Integer single/multicycle 0/1
+def OlympusUnitI : ProcResGroup<[OlympusUnitS, OlympusUnitM0, OlympusUnitM1]>; // Integer single-cycle 0/1/2/3/4/5 and single/multicycle 0/1
+def OlympusUnitV03 : ProcResGroup<[OlympusUnitV0, OlympusUnitV3]>; // FP/ASIMD 0/3
+def OlympusUnitV12 : ProcResGroup<[OlympusUnitV1, OlympusUnitV2]>; // FP/ASIMD 1/2
+def OlympusUnitV0123 : ProcResGroup<[OlympusUnitV0, OlympusUnitV1, OlympusUnitV2, OlympusUnitV3]>; // FP/ASIMD 0/1/2/3 (also used for vector store data)
+def OlympusUnitV : ProcResGroup<[OlympusUnitV0, OlympusUnitV1, OlympusUnitV2, OlympusUnitV3, OlympusUnitV45]>; // FP/ASIMD 0/1/2/3/4/5
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadST, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+def : WriteRes<WriteLDHi, []> { let Latency = 4; }
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Olympus.
+
+// Define generic 0 micro-op types
+
+let NumMicroOps = 0 in {
+ def OlympusWrite_0c : SchedWriteRes<[]> { let Latency = 0; }
+ def OlympusWrite_6c : SchedWriteRes<[]> { let Latency = 6; }
+} // NumMicroOps = 0
+
+// Define generic 1 micro-op types
+
+def OlympusWrite_1c_1B : SchedWriteRes<[OlympusUnitB]> { let Latency = 1; }
+def OlympusWrite_1c_1I : SchedWriteRes<[OlympusUnitI]> { let Latency = 1; }
+def OlympusWrite_1c_1M : SchedWriteRes<[OlympusUnitM]> { let Latency = 1; }
+def OlympusWrite_1c_1M0 : SchedWriteRes<[OlympusUnitM0]> { let Latency = 1; }
+def OlympusWrite_1c_1L : SchedWriteRes<[OlympusUnitL]> { let Latency = 1; }
+def OlympusWrite_2c_1M : SchedWriteRes<[OlympusUnitM]> { let Latency = 2; }
+def OlympusWrite_2c_1M0 : SchedWriteRes<[OlympusUnitM0]> { let Latency = 2; }
+def OlympusWrite_2c_1V : SchedWriteRes<[OlympusUnitV]> { let Latency = 2; }
+def OlympusWrite_2c_1V0 : SchedWriteRes<[OlympusUnitV0]> { let Latency = 2; }
+def OlympusWrite_2c_1V0123 : SchedWriteRes<[OlympusUnitV0123]> { let Latency = 2; }
+def OlympusWrite_2c_1V03 : SchedWriteRes<[OlympusUnitV03]> { let Latency = 2; }
+def OlympusWrite_2c_1V1 : SchedWriteRes<[OlympusUnitV1]> { let Latency = 2; }
+def OlympusWrite_3c_1M : SchedWriteRes<[OlympusUnitM]> { let Latency = 3; }
+def OlympusWrite_3c_1M0 : SchedWriteRes<[OlympusUnitM0]> { let Latency = 3; }
+def OlympusWrite_3c_1V : SchedWriteRes<[OlympusUnitV]> { let Latency = 3; }
+def OlympusWrite_3c_1V0 : SchedWriteRes<[OlympusUnitV0]> { let Latency = 3; }
+def OlympusWrite_3c_1V0123 : SchedWriteRes<[OlympusUnitV0123]> { let Latency = 3; }
+def OlympusWrite_3c_1V03 : SchedWriteRes<[OlympusUnitV03]> { let Latency = 3; }
+def OlympusWrite_3c_1V1 : SchedWriteRes<[OlympusUnitV1]> { let Latency = 3; }
+def OlympusWrite_3c_1V12 : SchedWriteRes<[OlympusUnitV12]> { let Latency = 3; }
+def OlympusWrite_4c_1M0 : SchedWriteRes<[OlympusUnitM0]> { let Latency = 4; }
+def OlympusWrite_4c_1L : SchedWriteRes<[OlympusUnitL]> { let Latency = 4; }
+def OlympusWrite_4c_1V : SchedWriteRes<[OlympusUnitV]> { let Latency = 4; }
+def OlympusWrite_4c_1V0 : SchedWriteRes<[OlympusUnitV0]> { let Latency = 4; }
+def OlympusWrite_4c_1V0123 : SchedWriteRes<[OlympusUnitV0123]> { let Latency = 4; }
+def OlympusWrite_4c_1V03 : SchedWriteRes<[OlympusUnitV03]> { let Latency = 4; }
+def OlympusWrite_4c_1V1 : SchedWriteRes<[OlympusUnitV1]> { let Latency = 4; }
+def OlympusWrite_5c_1V : SchedWriteRes<[OlympusUnitV]> { let Latency = 5; }
+def OlympusWrite_6c_1L : SchedWriteRes<[OlympusUnitL]> { let Latency = 6; }
+def OlympusWrite_6c_1V : SchedWriteRes<[OlympusUnitV]> { let Latency = 6; }
+def OlympusWrite_6c_1V1 : SchedWriteRes<[OlympusUnitV1]> { let Latency = 6; }
+def OlympusWrite_6c_1V12 : SchedWriteRes<[OlympusUnitV12]> { let Latency = 6; }
+def OlympusWrite_7c_1V0 : SchedWriteRes<[OlympusUnitV0]> { let Latency = 7; let ReleaseAtCycles = [7]; }
+def OlympusWrite_8c_1V12 : SchedWriteRes<[OlympusUnitV12]> { let Latency = 8; }
+def OlympusWrite_9c_1V0 : SchedWriteRes<[OlympusUnitV0]> { let Latency = 9; }
+def OlympusWrite_10c_1V0 : SchedWriteRes<[OlympusUnitV0]> { let Latency = 10; }
+def OlympusWrite_10c_1V1 : SchedWriteRes<[OlympusUnitV1]> { let Latency = 10; }
+def OlympusWrite_12c_1M : SchedWriteRes<[OlympusUnitM]> { let Latency = 12; let ReleaseAtCycles = [12]; }
+def OlympusWrite_12c_1V : SchedWriteRes<[OlympusUnitV]> { let Latency = 12; }
+def OlympusWrite_12c_1V45 : SchedWriteRes<[OlympusUnitV45]> { let Latency = 12; let ReleaseAtCycles = [12]; }
+def OlympusWrite_13c_1V0 : SchedWriteRes<[OlympusUnitV0]> { let Latency = 13; }
+def OlympusWrite_13c_1V12 : SchedWriteRes<[OlympusUnitV12]> { let Latency = 13; }
+def OlympusWrite_15c_1V0 : SchedWriteRes<[OlympusUnitV0]> { let Latency = 15; }
+def OlympusWrite_16c_1V0 : SchedWriteRes<[OlympusUnitV0]> { let Latency = 16; }
+def OlympusWrite_20c_1M : SchedWriteRes<[OlympusUnitM]> { let Latency = 20; let ReleaseAtCycles = [20]; }
+def OlympusWrite_20c_1V45 : SchedWriteRes<[OlympusUnitV45]> { let Latency = 20; let ReleaseAtCycles = [20]; }
+
+// These types are multi-pumped.
+def OlympusWrite_4c_1V0123_2 : SchedWriteRes<[OlympusUnitV0123]> { let Latency = 4; let ReleaseAtCycles = [2]; }
+def OlympusWrite_5c_1V0123_2 : SchedWriteRes<[OlympusUnitV0123]> { let Latency = 5; let ReleaseAtCycles = [2]; }
+def OlympusWrite_6c_1V0123_4 : SchedWriteRes<[OlympusUnitV0123]> { let Latency = 6; let ReleaseAtCycles = [4]; }
+def OlympusWrite_7c_1V03_6 : SchedWriteRes<[OlympusUnitV03]> { let Latency = 7; let ReleaseAtCycles = [6]; }
+def OlympusWrite_9c_1V12_2 : SchedWriteRes<[OlympusUnitV12]> { let Latency = 9; let ReleaseAtCycles = [2]; }
+def OlympusWrite_9c_1V12_4 : SchedWriteRes<[OlympusUnitV12]> { let Latency = 9; let ReleaseAtCycles = [4]; }
+def OlympusWrite_11c_1V03_10 : SchedWriteRes<[OlympusUnitV03]> { let Latency = 11; let ReleaseAtCycles = [10]; }
+def OlympusWrite_11c_1V12_4 : SchedWriteRes<[OlympusUnitV12]> { let Latency = 11; let ReleaseAtCycles = [4]; }
+def OlympusWrite_13c_1V12_8 : SchedWriteRes<[OlympusUnitV12]> { let Latency = 13; let ReleaseAtCycles = [8]; }
+def OlympusWrite_14c_1V12_2 : SchedWriteRes<[OlympusUnitV12]> { let Latency = 14; let ReleaseAtCycles = [2]; }
+
+// Hack to get the flagset throughputs right.
+def OlympusWrite_1c_1F : SchedWriteRes<[OlympusUnitI, OlympusUnitF]> { let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Define generic 2 micro-op types
+
+let NumMicroOps = 2 in {
+
+ def OlympusWrite_1c_1I_1B : SchedWriteRes<[OlympusUnitI, OlympusUnitB]> { let Latency = 1; }
+ def OlympusWrite_1c_1M0_1M : SchedWriteRes<[OlympusUnitM0, OlympusUnitM]> { let Latency = 1; }
+ def OlympusWrite_1c_1SA_1D : SchedWriteRes<[OlympusUnitSA, OlympusUnitD]> { let Latency = 1; }
+ def OlympusWrite_1c_2M : SchedWriteRes<[OlympusUnitM, OlympusUnitM]> { let Latency = 1; }
+ def OlympusWrite_2c_1L_1V : SchedWriteRes<[OlympusUnitL, OlympusUnitV]> { let Latency = 2; }
+ def OlympusWrite_2c_1M0_1M : SchedWriteRes<[OlympusUnitM0, OlympusUnitM]> { let Latency = 2; }
+ def OlympusWrite_2c_1M_1V03 : SchedWriteRes<[OlympusUnitV03, OlympusUnitM]> { let Latency = 2; }
+ def OlympusWrite_2c_1V0_1M : SchedWriteRes<[OlympusUnitV0, OlympusUnitM]> { let Latency = 2; }
+ def OlympusWrite_3c_1I_1M : SchedWriteRes<[OlympusUnitI, OlympusUnitM]> { let Latency = 3; }
+ def OlympusWrite_3c_1M_1M0 : SchedWriteRes<[OlympusUnitM, OlympusUnitM0]> { let Latency = 3; }
+ def OlympusWrite_3c_1M_1V03 : SchedWriteRes<[OlympusUnitV03, OlympusUnitM]> { let Latency = 3; }
+ def OlympusWrite_3c_1SA_1V0123 : SchedWriteRes<[OlympusUnitSA, OlympusUnitV0123]> { let Latency = 3; }
+ def OlympusWrite_3c_1V0_1M : SchedWriteRes<[OlympusUnitV0, OlympusUnitM]> { let Latency = 3; }
+ def OlympusWrite_3c_2V03 : SchedWriteRes<[OlympusUnitV03, OlympusUnitV03]> { let Latency = 3; }
+ def OlympusWrite_3c_2M : SchedWriteRes<[OlympusUnitM, OlympusUnitM]> { let Latency = 3; }
+ def OlympusWrite_4c_1L_1V : SchedWriteRes<[OlympusUnitL, OlympusUnitV]> { let Latency = 4; }
+ def OlympusWrite_4c_1M_1M0 : SchedWriteRes<[OlympusUnitM, OlympusUnitM0]> { let Latency = 4; }
+ def OlympusWrite_4c_1SA_1D : SchedWriteRes<[OlympusUnitSA, OlympusUnitD]> { let Latency = 4; }
+ def OlympusWrite_4c_2M : SchedWriteRes<[OlympusUnitM, OlympusUnitM]> { let Latency = 4; }
+ def OlympusWrite_4c_2V : SchedWriteRes<[OlympusUnitV, OlympusUnitV]> { let Latency = 4; }
+ def OlympusWrite_4c_2V0 : SchedWriteRes<[OlympusUnitV0, OlympusUnitV0]> { let Latency = 4; }
+ def OlympusWrite_5c_1B_1M0 : SchedWriteRes<[OlympusUnitB, OlympusUnitM0]> { let Latency = 5; }
+ def OlympusWrite_5c_1I_1L : SchedWriteRes<[OlympusUnitI, OlympusUnitL]> { let Latency = 5; }
+ def OlympusWrite_5c_1L_1F : SchedWriteRes<[OlympusUnitL, OlympusUnitF]> { let Latency = 5; }
+ def OlympusWrite_5c_1M0_1V : SchedWriteRes<[OlympusUnitM0, OlympusUnitV]> { let Latency = 5; }
+ def OlympusWrite_5c_1M_1L : SchedWriteRes<[OlympusUnitM, OlympusUnitL]> { let Latency = 5; }
+ def OlympusWrite_5c_1M_1V : SchedWriteRes<[OlympusUnitM, OlympusUnitV]> { let Latency = 5; }
+ def OlympusWrite_5c_2V : SchedWriteRes<[OlympusUnitV, OlympusUnitV]> { let Latency = 5; }
+ def OlympusWrite_5c_2V0 : SchedWriteRes<[OlympusUnitV0, OlympusUnitV0]> { let Latency = 5; }
+ def OlympusWrite_5c_1V_1V0123 : SchedWriteRes<[OlympusUnitV, OlympusUnitV0123]> { let Latency = 5; }
+ def OlympusWrite_6c_1I_1L : SchedWriteRes<[OlympusUnitI, OlympusUnitL]> { let Latency = 6; }
+ def OlympusWrite_6c_1L_1S : SchedWriteRes<[OlympusUnitL, OlympusUnitS]> { let Latency = 6; }
+ def OlympusWrite_6c_1V03_1V12 : SchedWriteRes<[OlympusUnitV03, OlympusUnitV12]> { let Latency = 6; }
+ def OlympusWrite_6c_1V1_1M0 : SchedWriteRes<[OlympusUnitV1, OlympusUnitM0]> { let Latency = 6; }
+ def OlympusWrite_6c_1V_1V0123 : SchedWriteRes<[OlympusUnitV, OlympusUnitV0123]> { let Latency = 6; }
+ def OlympusWrite_6c_2L : SchedWriteRes<[OlympusUnitL, OlympusUnitL]> { let Latency = 6; }
+ def OlympusWrite_6c_2V : SchedWriteRes<[OlympusUnitV, OlympusUnitV]> { let Latency = 6; }
+ def OlympusWrite_6c_2V12 : SchedWriteRes<[OlympusUnitV12, OlympusUnitV12]> { let Latency = 6; }
+ def OlympusWrite_6c_2V0123 : SchedWriteRes<[OlympusUnitV0123, OlympusUnitV0123]> { let Latency = 6; }
+ def OlympusWrite_7c_1F_1L : SchedWriteRes<[OlympusUnitF, OlympusUnitL]> { let Latency = 7; }
+ def OlympusWrite_7c_1I_1L : SchedWriteRes<[OlympusUnitI, OlympusUnitL]> { let Latency = 7; }
+ def OlympusWrite_7c_1M_1V0123 : SchedWriteRes<[OlympusUnitM, OlympusUnitV0123]> { let Latency = 7; }
+ def OlympusWrite_8c_1L_1V : SchedWriteRes<[OlympusUnitL, OlympusUnitV]> { let Latency = 8; }
+ def OlympusWrite_8c_1M0_1L : SchedWriteRes<[OlympusUnitM0, OlympusUnitL]> { let Latency = 8; }
+
+ // These types are multi-pumped.
----------------
rj-jesus wrote:
I added an explanation in lines 145-146 as it's the first time "multi-pumped" is mentioned - let me know if that works for you.
https://github.com/llvm/llvm-project/pull/171607
More information about the llvm-commits mailing list