[llvm] [X86] AMD Zen 5 Scheduler Descriptions (PR #131780)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 18 03:52:24 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Ganesh (ganeshgit)
<details>
<summary>Changes</summary>
Scheduler descriptions for AMD Zen5 architecture.
---
Patch is 3.51 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131780.diff
69 Files Affected:
- (modified) llvm/lib/Target/X86/X86.td (+2-1)
- (added) llvm/lib/Target/X86/X86ScheduleZnver5.td (+2354)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/independent-load-stores.s (+160)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-adx.s (+75)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-aes.s (+91)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx1.s (+2451)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx2.s (+1101)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512.s (+2992)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512bitalg.s (+100)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512bitalgvl.s (+151)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512bw.s (+1650)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512bwvl.s (+2963)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512cd.s (+169)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512cdvl.s (+287)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512dq.s (+1282)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512dqvl.s (+1682)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512gfni.s (+124)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512gfnivl.s (+199)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512ifma.s (+105)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512ifmavl.s (+161)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vaes.s (+77)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vaesvl.s (+105)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vbmi.s (+134)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vbmi2.s (+413)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vbmi2vl.s (+777)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vbmivl.s (+219)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vl.s (+4760)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vnni.s (+161)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vnnivl.s (+273)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vp2intersect.s (+53)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vp2intersectvl.s (+73)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vpclmulqdq.s (+56)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vpclmulqdqvl.s (+63)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vpopcntdq.s (+109)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vpopcntdqvl.s (+169)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avxgfni.s (+91)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avxvnni.s (+105)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-bmi1.s (+140)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-bmi2.s (+161)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-clflushopt.s (+53)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-clwb.s (+53)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-cmov.s (+343)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-cmpxchg.s (+62)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-f16c.s (+77)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-fma.s (+721)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-fsgsbase.s (+77)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-gfni.s (+70)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-lea.s (+457)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-lzcnt.s (+70)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-mmx.s (+413)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-movbe.s (+70)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-pclmul.s (+56)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-popcnt.s (+70)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-prefetchw.s (+56)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-rdrand.s (+59)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-rdseed.s (+59)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-sse1.s (+481)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-sse2.s (+980)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-sse3.s (+124)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-sse41.s (+386)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-sse42.s (+119)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-ssse3.s (+273)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-vaes.s (+77)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-vpclmulqdq.s (+56)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-x86_32.s (+98)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-x86_64.s (+2899)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-x87.s (+541)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-xsave.s (+69)
- (added) llvm/test/tools/llvm-mca/X86/Znver5/zero-idioms.s (+801)
``````````diff
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 38761e1fd7eec..554ae4cb22abe 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -810,6 +810,7 @@ include "X86ScheduleZnver1.td"
include "X86ScheduleZnver2.td"
include "X86ScheduleZnver3.td"
include "X86ScheduleZnver4.td"
+include "X86ScheduleZnver5.td"
include "X86ScheduleBdVer2.td"
include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
@@ -1958,7 +1959,7 @@ def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
ProcessorFeatures.ZN3Tuning>;
def : ProcModel<"znver4", Znver4Model, ProcessorFeatures.ZN4Features,
ProcessorFeatures.ZN4Tuning>;
-def : ProcModel<"znver5", Znver4Model, ProcessorFeatures.ZN5Features,
+def : ProcModel<"znver5", Znver5Model, ProcessorFeatures.ZN5Features,
ProcessorFeatures.ZN5Tuning>;
def : Proc<"geode", [FeatureX87, FeatureCX8, FeatureMMX, FeaturePRFCHW],
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver5.td b/llvm/lib/Target/X86/X86ScheduleZnver5.td
new file mode 100644
index 0000000000000..9222cbef175fa
--- /dev/null
+++ b/llvm/lib/Target/X86/X86ScheduleZnver5.td
@@ -0,0 +1,2354 @@
+//=- X86ScheduleZnver5.td - X86 Znver5 Scheduling ------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Znver5 to support instruction
+// scheduling and other instruction cost heuristics.
+// Based on: (To Be Updated)
+// * Early information of hardware specification.
+//===----------------------------------------------------------------------===//
+
+def Znver5Model : SchedMachineModel {
+ // The processor may dispatch up to 8 macro ops per cycle
+ // into the execution engine.
+ let IssueWidth = 8;
+ // The retire control unit (RCU) tracks the completion status of all
+ // outstanding operations (integer, load/store, and floating-point) and is
+ // the final arbiter for exception processing and recovery.
+ // The unit can receive up to 8 macro ops dispatched per cycle and track up
+ // to 448 macro ops in-flight in non-SMT mode or 224 per thread in SMT mode.
+ let MicroOpBufferSize = 448;
+ // The op cache is organized as an associative cache with 64 sets and 8 ways.
+ // At each set-way intersection is an entry containing up to 8 macro ops.
+ // The maximum capacity of the op cache is 6.75K ops.
+ // Assuming a maximum dispatch of 8 ops/cy and a mispredict cost of 12cy from
+ // the op-cache, we limit the loop buffer to 8*12 = 96 to avoid loop
+ // unrolling leading to excessive filling of the op-cache from frontend.
+ let LoopMicroOpBufferSize = 96;
+ // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
+ // The AGU and LS pipelines are optimized for simple address generation modes.
+ // <...> and can achieve 4-cycle load-to-use integer load latency.
+ let LoadLatency = 4;
+ // The AGU and LS pipelines are optimized for simple address generation modes.
+ // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
+ int VecLoadLatency = 7;
+ // Latency of a simple store operation.
+ int StoreLatency = 1;
+
+ let HighLatency = 25; // FIXME: any better choice?
+
+ // The branch misprediction penalty is in the range from 12 to 18 cycles,
+ // <...>. The common case penalty is 15 cycles.
+ let MispredictPenalty = 15;
+
+ let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
+
+ let CompleteModel = 1;
+}
+
+let SchedModel = Znver5Model in {
+
+
+//===----------------------------------------------------------------------===//
+// RCU
+//===----------------------------------------------------------------------===//
+
+// The unit can receive up to 8 macro ops dispatched per cycle and track up to
+// 448 macro ops in-flight in non-SMT mode or 224 per thread in SMT mode. <...>
+// The retire unit handles in-order commit of up to eight macro ops per cycle.
+def Zn5RCU : RetireControlUnit<Znver5Model.MicroOpBufferSize, 8>;
+
+//===----------------------------------------------------------------------===//
+// Integer Execution Unit
+//
+
+// As modeled below (see Zn5Int), the integer unit has three schedulers:
+// one for ALU0-2, one for ALU3-5, and one for AGU0-3.
+
+//
+// Execution pipes
+//===----------------------------------------------------------------------===//
+
+// The processor contains 6 general purpose integer execution pipes.
+// Each pipe has an ALU capable of general purpose integer operations.
+def Zn5ALU0 : ProcResource<1>;
+def Zn5ALU1 : ProcResource<1>;
+def Zn5ALU2 : ProcResource<1>;
+def Zn5ALU3 : ProcResource<1>;
+def Zn5ALU4 : ProcResource<1>;
+def Zn5ALU5 : ProcResource<1>;
+
+// There is also a separate branch execution unit.
+// NOTE(review): a `defvar Zn5BRU1` (aliasing Zn5ALU4) follows in the
+// "Execution Units" section; this standalone resource may be stale - confirm.
+def Zn5BRU1 : ProcResource<1>;
+
+// There are four Address Generation Units (AGUs) for all load and store
+// address generation. There are also 3 store data movement units
+// associated with the same schedulers as the AGUs.
+def Zn5AGU0 : ProcResource<1>;
+def Zn5AGU1 : ProcResource<1>;
+def Zn5AGU2 : ProcResource<1>;
+def Zn5AGU3 : ProcResource<1>;
+
+//
+// Execution Units
+//===----------------------------------------------------------------------===//
+
+// The divider is modeled on ALU3.
+// NOTE(review): this comment used to name ALU0; confirm which pipe is right.
+defvar Zn5Divider = Zn5ALU3;
+
+// ALU3,4,5 additionally have <...> branch execution capability.
+defvar Zn5BRU0 = Zn5ALU3;
+defvar Zn5BRU1 = Zn5ALU4;
+defvar Zn5BRU2 = Zn5ALU5;
+
+// Integer multiplication is modeled on ALU0, ALU1 and ALU2.
+//defvar Zn5Multiplier = Zn5ALU1;
+defvar Zn5MUL0 = Zn5ALU0;
+defvar Zn5MUL1 = Zn5ALU1;
+defvar Zn5MUL2 = Zn5ALU2;
+
+// Execution pipeline grouping
+//===----------------------------------------------------------------------===//
+
+// General ALU operations
+// Simple bit twiddling: bit test, shift/rotate, bit extraction
+// Zn5ALU0,1,2 can also handle CRC in addition to multiply
+def Zn5ALU012 : ProcResGroup<[Zn5ALU0, Zn5ALU1, Zn5ALU2]>;
+
+// Zn5ALU3,4,5 handle complex bit twiddling: PDEP/PEXT
+def Zn5ALU345 : ProcResGroup<[Zn5ALU3, Zn5ALU4, Zn5ALU5]>;
+
+def Zn5ALU0_5 : ProcResGroup<[Zn5ALU0, Zn5ALU1, Zn5ALU2, Zn5ALU3, Zn5ALU4, Zn5ALU5]>;
+
+// General AGU operations
+def Zn5AGU0123 : ProcResGroup<[Zn5AGU0, Zn5AGU1, Zn5AGU2, Zn5AGU3]>;
+
+// Multipliers
+def Zn5Multiplier : ProcResGroup<[Zn5MUL0, Zn5MUL1, Zn5MUL2]>;
+
+// Control flow: jumps, calls
+def Zn5BRU012 : ProcResGroup<[Zn5BRU0, Zn5BRU1, Zn5BRU2]>;
+
+// Everything that isn't control flow, but still needs to access CC register,
+// namely: conditional moves, SETcc.
+def Zn5ALU03 : ProcResGroup<[Zn5ALU0, Zn5ALU3]>;
+
+//
+// Scheduling
+//===----------------------------------------------------------------------===//
+
+// The integer physical register file (PRF) consists of 240 registers.
+def Zn5IntegerPRF : RegisterFile<240, [GR64, CCR], [1, 1], [1, 0],
+ 8, // Max moves that can be eliminated per cycle.
+ 0>; // Restrict move elimination to zero regs.
+
+// The integer scheduler has a 136 entry macro op capacity.
+// The schedulers can receive up to eight macro ops per cycle, with a limit of
+// four per scheduler. Separate schedulers service the ALUs and the AGUs.
+// The ALU schedulers can issue three micro-ops per cycle into each of the ALU
+// pipeline groups (ALU0-2) and (ALU3-5). The AGU scheduler can issue four
+// micro-ops per cycle to its associated pipelines.
+
+def Zn5Int : ProcResGroup<[Zn5ALU0, Zn5ALU1, Zn5ALU2, // scheduler 1
+ Zn5ALU3, Zn5ALU4, Zn5ALU5, // scheduler 2
+ Zn5AGU0, Zn5AGU1, Zn5AGU2, Zn5AGU3 // scheduler 3
+ ]> {
+ // There are two banks of the buffers with 136 macro op capacity.
+ let BufferSize = 136;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-Point Unit
+//
+
+// The processor uses <...> two decoupled independent floating point schedulers
+// each servicing two FP pipelines and one store or FP-to-integer pipeline.
+
+//
+// Execution pipes
+//===----------------------------------------------------------------------===//
+
+// <...>, and six FPU pipes.
+// Agner, 22.10 Floating point execution pipes
+// There are six floating point/vector execution pipes. Pipes 0-3 are modeled
+// individually; pipes 4 and 5 share one two-unit resource (Zn5FP45).
+def Zn5FP0 : ProcResource<1>;
+def Zn5FP1 : ProcResource<1>;
+def Zn5FP2 : ProcResource<1>;
+def Zn5FP3 : ProcResource<1>;
+def Zn5FP45 : ProcResource<2>;
+
+//
+// Execution Units
+//===----------------------------------------------------------------------===//
+
+// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
+defvar Zn5FPFMul0 = Zn5FP0;
+defvar Zn5FPFMul1 = Zn5FP1;
+
+// (v)FADD*
+// Complex VADD operations are not available in all pipes. (VADDPD etc)
+defvar Zn5FPFAdd0 = Zn5FP2;
+defvar Zn5FPFAdd1 = Zn5FP3;
+
+// All convert operations except pack/unpack
+defvar Zn5FPFCvt0 = Zn5FP2;
+defvar Zn5FPFCvt1 = Zn5FP3;
+
+// All Divide and Square Root except Reciprocal Approximation
+// FDIV unit can support 2 simultaneous operations in flight
+// even though it occupies a single pipe.
+// FIXME: BufferSize=2 ?
+defvar Zn5FPFDiv = Zn5FP1;
+
+// Moves and Logical operations on Floating Point Data Types
+defvar Zn5FPFMisc0 = Zn5FP0;
+defvar Zn5FPFMisc1 = Zn5FP1;
+defvar Zn5FPFMisc2 = Zn5FP2;
+defvar Zn5FPFMisc3 = Zn5FP3;
+
+// Integer Adds, Subtracts, and Compares
+// Some complex VADD operations are not available in all pipes.
+defvar Zn5FPVAdd0 = Zn5FP0;
+defvar Zn5FPVAdd1 = Zn5FP1;
+defvar Zn5FPVAdd2 = Zn5FP2;
+defvar Zn5FPVAdd3 = Zn5FP3;
+
+// Integer Multiplies, SAD, Blendvb
+defvar Zn5FPVMul0 = Zn5FP0;
+defvar Zn5FPVMul1 = Zn5FP1;
+defvar Zn5FPVMul2 = Zn5FP3;
+
+// Data Shuffles, Packs, Unpacks, Permute
+// Some complex shuffle operations are only available in pipe1.
+defvar Zn5FPVShuf = Zn5FP1;
+defvar Zn5FPVShufAux = Zn5FP2;
+
+// Bit Shift Left/Right operations
+defvar Zn5FPVShift0 = Zn5FP1;
+defvar Zn5FPVShift1 = Zn5FP2;
+
+// Moves and Logical operations on Packed Integer Data Types
+defvar Zn5FPVMisc0 = Zn5FP0;
+defvar Zn5FPVMisc1 = Zn5FP1;
+defvar Zn5FPVMisc2 = Zn5FP2;
+defvar Zn5FPVMisc3 = Zn5FP3;
+
+// *AES*
+defvar Zn5FPAES0 = Zn5FP0;
+defvar Zn5FPAES1 = Zn5FP1;
+
+// *CLM*
+defvar Zn5FPCLM0 = Zn5FP0;
+defvar Zn5FPCLM1 = Zn5FP1;
+
+// Execution pipeline grouping
+//===----------------------------------------------------------------------===//
+
+// FLD use pipes 1 and 2.
+def Zn5FPU12 : ProcResGroup<[Zn5FP1, Zn5FP2]>;
+
+// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
+def Zn5FPFMul01 : ProcResGroup<[Zn5FPFMul0, Zn5FPFMul1]>;
+
+// (v)FADD*
+// Some complex VADD operations are not available in all pipes.
+def Zn5FPFAdd01 : ProcResGroup<[Zn5FPFAdd0, Zn5FPFAdd1]>;
+
+// All convert operations except pack/unpack
+def Zn5FPFCvt01 : ProcResGroup<[Zn5FPFCvt0, Zn5FPFCvt1]>;
+
+// All Divide and Square Root except Reciprocal Approximation
+// def Zn5FPFDiv : ProcResGroup<[Zn5FPFDiv]>;
+
+// Moves and Logical operations on Floating Point Data Types
+def Zn5FPFMisc0123 : ProcResGroup<[Zn5FPFMisc0, Zn5FPFMisc1, Zn5FPFMisc2, Zn5FPFMisc3]>;
+
+// FIXUP and RANGE use FP01 pipelines
+def Zn5FPFMisc01 : ProcResGroup<[Zn5FPFMisc0, Zn5FPFMisc1]>;
+def Zn5FPFMisc12 : ProcResGroup<[Zn5FPFMisc1, Zn5FPFMisc2]>;
+// SCALE instructions use FP23 pipelines
+def Zn5FPFMisc23 : ProcResGroup<[Zn5FPFMisc2, Zn5FPFMisc3]>;
+def Zn5FPFMisc123 : ProcResGroup<[Zn5FPFMisc1,Zn5FPFMisc2, Zn5FPFMisc3]>;
+
+// Loads, Stores and Move to General Register (EX) Operations
+// Stores and floating point to general purpose register transfer
+// have 2 dedicated pipelines (pipe 5 and 6).
+defvar Zn5FPLd01 = Zn5FP45;
+
+// Note that FP stores are supported on two pipelines,
+// but throughput is limited to one per cycle.
+let Super = Zn5FP45 in
+def Zn5FPSt : ProcResource<1>;
+
+// Integer Adds, and Subtracts
+def Zn5FPVAdd0123 : ProcResGroup<[Zn5FPVAdd0, Zn5FPVAdd1, Zn5FPVAdd2, Zn5FPVAdd3]>;
+// Compares use first four pipes
+def Zn5FPU0123 : ProcResGroup<[Zn5FP0, Zn5FP1, Zn5FP2, Zn5FP3]>;
+
+def Zn5FPVAdd01: ProcResGroup<[Zn5FPVAdd0, Zn5FPVAdd1]>;
+def Zn5FPVAdd12: ProcResGroup<[Zn5FPVAdd1, Zn5FPVAdd2]>;
+def Zn5FPVAdd03: ProcResGroup<[Zn5FPVAdd0, Zn5FPVAdd3]>;
+
+// AVX512 Opmask pipelines
+def Zn5FPOpMask01: ProcResGroup<[Zn5FP0, Zn5FP3]>;
+def Zn5FPOpMask4: ProcResGroup<[Zn5FP45]>;
+
+// Integer Multiplies, SAD, Blendvb
+def Zn5FPVMul012 : ProcResGroup<[Zn5FPVMul0, Zn5FPVMul1, Zn5FPVMul2]>;
+
+// VNNIs execute in the first two pipes.
+def Zn5FPVMul01 : ProcResGroup<[Zn5FPVMul0, Zn5FPVMul1]>;
+
+// Data Shuffles, Packs, Unpacks, Permute
+// Some complex shuffle operations are only available in pipe1.
+def Zn5FPVShuf01 : ProcResGroup<[Zn5FPVShuf, Zn5FPVShufAux]>;
+
+// Bit Shift Left/Right operations
+def Zn5FPVShift01 : ProcResGroup<[Zn5FPVShift0, Zn5FPVShift1]>;
+
+// Moves and Logical operations on Packed Integer Data Types
+def Zn5FPVMisc0123 : ProcResGroup<[Zn5FPVMisc0, Zn5FPVMisc1, Zn5FPVMisc2, Zn5FPVMisc3]>;
+
+// *AES*
+def Zn5FPAES01 : ProcResGroup<[Zn5FPAES0, Zn5FPAES1]>;
+
+// *CLM*
+def Zn5FPCLM01 : ProcResGroup<[Zn5FPCLM0, Zn5FPCLM1]>;
+
+
+//
+// Scheduling
+//===----------------------------------------------------------------------===//
+
+// Agner, 21.8 Register renaming and out-of-order schedulers
+// The floating point register file has 384 vector registers
+// of 512b each in zen5.
+def Zn5FpPRF : RegisterFile<384, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
+ 6, // Max moves that can be eliminated per cycle.
+ 0>; // Restrict move elimination to zero regs.
+
+// The floating-point scheduler has a 3*38 entry macro op capacity.
+// <...> the scheduler can issue 1 micro op per cycle for each pipe.
+// FIXME: those are two separate schedulers, not a single big one.
+def Zn5FP : ProcResGroup<[Zn5FP0, Zn5FP2, /*Zn5FP4,*/ // scheduler 0
+ Zn5FP1, Zn5FP3, Zn5FP45 /*Zn5FP5*/ // scheduler 1
+ ]> {
+ let BufferSize = !mul(3, 38);
+}
+
+// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
+// even if floating-point scheduler is full.
+// FIXME: how to model this properly?
+
+
+//===----------------------------------------------------------------------===//
+// Load-Store Unit
+//
+
+// The LS unit contains four largely independent pipe-lines
+// enabling the execution of four memory operations per cycle.
+def Zn5LSU : ProcResource<4>;
+
+// All four memory operations can be loads.
+let Super = Zn5LSU in
+def Zn5Load : ProcResource<4> {
+ // The LS unit can process up to 128 out-of-order loads.
+ let BufferSize = 128;
+}
+
+def Zn5LoadQueue : LoadQueue<Zn5Load>;
+
+// A maximum of two of the memory operations can be stores.
+let Super = Zn5LSU in
+def Zn5Store : ProcResource<2> {
+ // The LS unit utilizes a 104-entry store queue (STQ).
+ let BufferSize = 104;
+}
+
+def Zn5StoreQueue : StoreQueue<Zn5Store>;
+
+//===----------------------------------------------------------------------===//
+// Basic helper classes.
+//===----------------------------------------------------------------------===//
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+
+// Emits one WriteRes record for SchedRW on ExePorts, setting its latency
+// (Lat), per-resource ReleaseAtCycles (Res) and micro-op count (UOps).
+multiclass __Zn5WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
+ int Lat = 1, list<int> Res = [], int UOps = 1> {
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ReleaseAtCycles = Res;
+ let NumMicroOps = UOps;
+ }
+}
+
+// Emits two WriteRes records: one for SchedRW itself and one for its
+// folded-load variant (SchedRW.Folded). The folded variant:
+// * prepends AGU and Zn5Load to ExePorts,
+// * has latency Lat + LoadLat and micro-op count UOps + LoadUOps,
+// * leaves ReleaseAtCycles empty when Res is empty and LoadRes is 1;
+// otherwise it is [1, LoadRes] followed by Res, or by a 1 for each
+// entry of ExePorts when Res is empty.
+multiclass __Zn5WriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat,
+ list<int> Res, int UOps, int LoadLat, int LoadUOps,
+ ProcResourceKind AGU, int LoadRes> {
+ // Register-only form.
+ defm : __Zn5WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+
+ // Folded-load form.
+ defm : __Zn5WriteRes<SchedRW.Folded,
+ !listconcat([AGU, Zn5Load], ExePorts),
+ !add(Lat, LoadLat),
+ !if(!and(!empty(Res), !eq(LoadRes, 1)),
+ [],
+ !listconcat([1, LoadRes],
+ !if(!empty(Res),
+ !listsplat(1, !size(ExePorts)),
+ Res))),
+ !add(UOps, LoadUOps)>;
+}
+
+// For classes without folded loads.
+multiclass Zn5WriteResInt<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __Zn5WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn5WriteResXMM<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __Zn5WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn5WriteResYMM<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __Zn5WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn5WriteResZMM<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __Zn5WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+// For classes with folded loads.
+multiclass Zn5WriteResIntPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __Zn5WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver5Model.LoadLatency,
+ LoadUOps, Zn5AGU0123, LoadRes>;
+}
+
+multiclass Zn5WriteResXMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __Zn5WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver5Model.VecLoadLatency,
+ LoadUOps, Zn5FPLd01, LoadRes>;
+}
+
+multiclass Zn5WriteResYMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __Zn5WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver5Model.VecLoadLatency,
+ LoadUOps, Zn5FPLd01, LoadRes>;
+}
+
+// Register/folded-load pair for ZMM writes. The folded form adds
+// Znver5Model.VecLoadLatency and uses Zn5FPLd01 as the address resource.
+// Unlike the XMM/YMM pair multiclasses, UOps defaults to 2 here.
+// NOTE(review): confirm the 2-uop default is intended for Zen 5.
+multiclass Zn5WriteResZMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 2,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __Zn5WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver5Model.VecLoadLatency,
+ LoadUOps, Zn5FPLd01, LoadRes>;
+}
+
+//===----------------------------------------------------------------------===//
+// Here be dragons.
+//===----------------------------------------------------------------------===//
+
+def : ReadAdvance<ReadAfterLd, Znver5Model.LoadLatency>;
+
+def : ReadAdvance<ReadAfterVecLd, Znver5Model.VecLoadLatency>;
+def : ReadAdvance<ReadAfterVecXLd, Znver5Model.VecLoadLatency>;
+def : ReadAdvance<ReadAfterVecYLd, Znver5Model.VecLoadLatency>;
+
+// There is 1 cycle of added latency for a result to cross
+// from F to I or I to F domain.
+def : ReadAdvance<ReadInt2Fpu, -1>;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+defm : Zn5WriteResInt<WriteRMW, [Zn5...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/131780
More information about the llvm-commits
mailing list