[llvm] [X86] AMD Zen 5 Scheduler Descriptions (PR #131780)

Tue Mar 18 03:52:24 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-x86

Author: Ganesh (ganeshgit)

<details>
<summary>Changes</summary>

Scheduler descriptions for AMD Zen5 architecture.

---

Patch is 3.51 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131780.diff


69 Files Affected:

- (modified) llvm/lib/Target/X86/X86.td (+2-1) 
- (added) llvm/lib/Target/X86/X86ScheduleZnver5.td (+2354) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/independent-load-stores.s (+160) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-adx.s (+75) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-aes.s (+91) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx1.s (+2451) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx2.s (+1101) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512.s (+2992) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512bitalg.s (+100) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512bitalgvl.s (+151) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512bw.s (+1650) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512bwvl.s (+2963) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512cd.s (+169) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512cdvl.s (+287) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512dq.s (+1282) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512dqvl.s (+1682) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512gfni.s (+124) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512gfnivl.s (+199) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512ifma.s (+105) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512ifmavl.s (+161) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vaes.s (+77) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vaesvl.s (+105) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vbmi.s (+134) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vbmi2.s (+413) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vbmi2vl.s (+777) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vbmivl.s (+219) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vl.s (+4760) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vnni.s (+161) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vnnivl.s (+273) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vp2intersect.s (+53) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vp2intersectvl.s (+73) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vpclmulqdq.s (+56) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vpclmulqdqvl.s (+63) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vpopcntdq.s (+109) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avx512vpopcntdqvl.s (+169) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avxgfni.s (+91) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-avxvnni.s (+105) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-bmi1.s (+140) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-bmi2.s (+161) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-clflushopt.s (+53) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-clwb.s (+53) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-cmov.s (+343) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-cmpxchg.s (+62) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-f16c.s (+77) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-fma.s (+721) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-fsgsbase.s (+77) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-gfni.s (+70) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-lea.s (+457) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-lzcnt.s (+70) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-mmx.s (+413) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-movbe.s (+70) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-pclmul.s (+56) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-popcnt.s (+70) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-prefetchw.s (+56) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-rdrand.s (+59) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-rdseed.s (+59) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-sse1.s (+481) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-sse2.s (+980) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-sse3.s (+124) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-sse41.s (+386) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-sse42.s (+119) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-ssse3.s (+273) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-vaes.s (+77) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-vpclmulqdq.s (+56) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-x86_32.s (+98) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-x86_64.s (+2899) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-x87.s (+541) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/resources-xsave.s (+69) 
- (added) llvm/test/tools/llvm-mca/X86/Znver5/zero-idioms.s (+801) 


``````````diff

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 38761e1fd7eec..554ae4cb22abe 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -810,6 +810,7 @@ include "X86ScheduleZnver1.td"
 include "X86ScheduleZnver2.td"
 include "X86ScheduleZnver3.td"
 include "X86ScheduleZnver4.td"
+include "X86ScheduleZnver5.td"
 include "X86ScheduleBdVer2.td"
 include "X86ScheduleBtVer2.td"
 include "X86SchedSkylakeClient.td"
@@ -1958,7 +1959,7 @@ def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
                 ProcessorFeatures.ZN3Tuning>;
 def : ProcModel<"znver4", Znver4Model, ProcessorFeatures.ZN4Features,
                 ProcessorFeatures.ZN4Tuning>;
-def : ProcModel<"znver5", Znver4Model, ProcessorFeatures.ZN5Features,
+def : ProcModel<"znver5", Znver5Model, ProcessorFeatures.ZN5Features,
                 ProcessorFeatures.ZN5Tuning>;
 
 def : Proc<"geode",           [FeatureX87, FeatureCX8, FeatureMMX, FeaturePRFCHW],
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver5.td b/llvm/lib/Target/X86/X86ScheduleZnver5.td
new file mode 100644
index 0000000000000..9222cbef175fa
--- /dev/null
+++ b/llvm/lib/Target/X86/X86ScheduleZnver5.td
@@ -0,0 +1,2354 @@
+//=- X86ScheduleZnver5.td - X86 Znver5 Scheduling ------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Znver5 to support instruction
+// scheduling and other instruction cost heuristics.
+// Based on: (To Be Updated)
+//  * Early information of hardware specification.
+//===----------------------------------------------------------------------===//
+
+def Znver5Model : SchedMachineModel {
+  // The processor may dispatch up to 8 macro ops per cycle
+  // into the execution engine.
+  let IssueWidth = 8;
+  // The retire control unit (RCU) tracks the completion status of all
+  // outstanding operations (integer, load/store, and floating-point) and is
+  // the final arbiter for exception processing and recovery.
+  // The unit can receive up to 8 macro ops dispatched per cycle and track up
+  // to 448 macro ops in-flight in non-SMT mode or 224 per thread in SMT mode.
+  let MicroOpBufferSize = 448;
+  // The op cache is organized as an associative cache with 64 sets and 8 ways.
+  // At each set-way intersection is an entry containing up to 8 macro ops.
+  // The maximum capacity of the op cache is 6.75K ops. FIXME: 64*8*8 is 4K.
+  // Assuming a maximum dispatch of 8 ops/cy and a mispredict cost of 12cy from
+  // the op-cache, we limit the loop buffer to 8*12 = 96 to avoid loop
+  // unrolling leading to excessive filling of the op-cache from frontend.
+  let LoopMicroOpBufferSize = 96;
+  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
+  // The AGU and LS pipelines are optimized for simple address generation modes.
+  // <...> and can achieve 4-cycle load-to-use integer load latency.
+  let LoadLatency = 4;
+  // The AGU and LS pipelines are optimized for simple address generation modes.
+  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
+  int VecLoadLatency = 7;
+  // Latency of a simple store operation.
+  int StoreLatency = 1;
+
+  let HighLatency = 25; // FIXME: any better choice?
+
+  // The branch misprediction penalty is in the range from 12 to 18 cycles,
+  // <...>. The common case penalty is 15 cycles.
+  let MispredictPenalty = 15;
+
+  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
+
+  let CompleteModel = 1;
+}
+
+let SchedModel = Znver5Model in {
+
+
+//===----------------------------------------------------------------------===//
+// RCU
+//===----------------------------------------------------------------------===//
+
+// The unit can receive up to 8 macro ops dispatched per cycle and track up to
+// 448 macro ops in-flight in non-SMT mode or 224 per thread in SMT mode. <...>
+// The retire unit handles in-order commit of up to eight macro ops per cycle.
+def Zn5RCU : RetireControlUnit<Znver5Model.MicroOpBufferSize, 8>;
+
+//===----------------------------------------------------------------------===//
+// Integer Execution Unit
+//
+
+// The processor uses four decoupled independent integer scheduler queues,
+// each one servicing one ALU pipeline and one or two other pipelines
+
+//
+// Execution pipes
+//===----------------------------------------------------------------------===//
+
+// The processor contains 6 general purpose integer execution pipes.
+// Each pipe has an ALU capable of general purpose integer operations.
+def Zn5ALU0 : ProcResource<1>;
+def Zn5ALU1 : ProcResource<1>;
+def Zn5ALU2 : ProcResource<1>;
+def Zn5ALU3 : ProcResource<1>;
+def Zn5ALU4 : ProcResource<1>;
+def Zn5ALU5 : ProcResource<1>;
+
+// There is also a separate branch execution unit.
+def Zn5BRU1 : ProcResource<1>; // NOTE(review): name reused by a defvar below.
+
+// There are four Address Generation Units (AGUs) for all load and store
+// address generation. There are also 3 store data movement units
+// associated with the same schedulers as the AGUs.
+def Zn5AGU0 : ProcResource<1>;
+def Zn5AGU1 : ProcResource<1>;
+def Zn5AGU2 : ProcResource<1>;
+def Zn5AGU3 : ProcResource<1>;
+
+//
+// Execution Units
+//===----------------------------------------------------------------------===//
+
+// ALU3 additionally has divide <...> execution capability.
+defvar Zn5Divider = Zn5ALU3;
+
+// ALU3,4,5 additionally have <...> branch execution capability.
+defvar Zn5BRU0 = Zn5ALU3;
+defvar Zn5BRU1 = Zn5ALU4;
+defvar Zn5BRU2 = Zn5ALU5;
+
+// Integer Multiplication issued on ALU0, ALU1 and ALU2.
+//defvar Zn5Multiplier = Zn5ALU1;
+defvar Zn5MUL0 = Zn5ALU0;
+defvar Zn5MUL1 = Zn5ALU1;
+defvar Zn5MUL2 = Zn5ALU2;
+
+// Execution pipeline grouping
+//===----------------------------------------------------------------------===//
+
+// General ALU operations
+// Simple bit twiddling: bit test, shift/rotate, bit extraction
+// Zn5ALU0,1,2 can also handle CRC in addition to multiply
+def Zn5ALU012 : ProcResGroup<[Zn5ALU0, Zn5ALU1, Zn5ALU2]>;
+
+// Zn5ALU3,4,5 handle complex bit twiddling: PDEP/PEXT
+def Zn5ALU345 : ProcResGroup<[Zn5ALU3, Zn5ALU4, Zn5ALU5]>;
+
+def Zn5ALU0_5 : ProcResGroup<[Zn5ALU0, Zn5ALU1, Zn5ALU2, Zn5ALU3, Zn5ALU4, Zn5ALU5]>;
+
+// General AGU operations
+def Zn5AGU0123 : ProcResGroup<[Zn5AGU0, Zn5AGU1, Zn5AGU2, Zn5AGU3]>;
+
+// Multipliers
+def Zn5Multiplier : ProcResGroup<[Zn5MUL0, Zn5MUL1, Zn5MUL2]>;
+
+// Control flow: jumps, calls
+def Zn5BRU012 : ProcResGroup<[Zn5BRU0, Zn5BRU1, Zn5BRU2]>;
+
+// Everything that isn't control flow, but still needs to access CC register,
+// namely: conditional moves, SETcc.
+def Zn5ALU03 : ProcResGroup<[Zn5ALU0, Zn5ALU3]>;
+
+//
+// Scheduling
+//===----------------------------------------------------------------------===//
+
+// The integer physical register file (PRF) consists of 240 registers.
+def Zn5IntegerPRF : RegisterFile<240, [GR64, CCR], [1, 1], [1, 0],
+                              8,  // Max moves that can be eliminated per cycle.
+                              0>; // Restrict move elimination to zero regs.
+
+// The integer scheduler has a 136 entry macro op capacity.
+// The schedulers can receive up to eight macro ops per cycle, with a limit of
+// four per scheduler. Schedulers service ALU units and AGU units separately.
+// ALU scheduler can issue three micro-ops per cycle into each of ALU pipeline
+// groups (ALU0-2) and (ALU3-5). AGU scheduler can issue four micro-ops per cycle
+// to its associated pipelines.
+
+def Zn5Int : ProcResGroup<[Zn5ALU0, Zn5ALU1, Zn5ALU2,         // scheduler 1
+                           Zn5ALU3, Zn5ALU4, Zn5ALU5,         // scheduler 2
+                           Zn5AGU0, Zn5AGU1, Zn5AGU2, Zn5AGU3 // scheduler 3
+                          ]> {
+  // There are two banks of the buffers with 136 macro op capacity.
+  let BufferSize = 136;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-Point Unit
+//
+
+// The processor uses <...> two decoupled independent floating point schedulers
+// each servicing two FP pipelines and one store or FP-to-integer pipeline.
+
+//
+// Execution pipes
+//===----------------------------------------------------------------------===//
+
+// <...>, and six FPU pipes.
+// Agner, 22.10 Floating point execution pipes
+// There are six floating point/vector execution pipes,
+def Zn5FP0  : ProcResource<1>;
+def Zn5FP1  : ProcResource<1>;
+def Zn5FP2  : ProcResource<1>;
+def Zn5FP3  : ProcResource<1>;
+def Zn5FP45 : ProcResource<2>;
+
+//
+// Execution Units
+//===----------------------------------------------------------------------===//
+
+// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
+defvar Zn5FPFMul0 = Zn5FP0;
+defvar Zn5FPFMul1 = Zn5FP1;
+
+// (v)FADD*
+// Complex VADD operations are not available in all pipes. (VADDPD etc)
+defvar Zn5FPFAdd0 = Zn5FP2;
+defvar Zn5FPFAdd1 = Zn5FP3;
+
+// All convert operations except pack/unpack
+defvar Zn5FPFCvt0 = Zn5FP2;
+defvar Zn5FPFCvt1 = Zn5FP3;
+
+// All Divide and Square Root except Reciprocal Approximation
+// FDIV unit can support 2 simultaneous operations in flight
+// even though it occupies a single pipe.
+// FIXME: BufferSize=2 ?
+defvar Zn5FPFDiv = Zn5FP1;
+
+// Moves and Logical operations on Floating Point Data Types
+defvar Zn5FPFMisc0 = Zn5FP0;
+defvar Zn5FPFMisc1 = Zn5FP1;
+defvar Zn5FPFMisc2 = Zn5FP2;
+defvar Zn5FPFMisc3 = Zn5FP3;
+
+// Integer Adds, Subtracts, and Compares
+// Some complex VADD operations are not available in all pipes.
+defvar Zn5FPVAdd0 = Zn5FP0;
+defvar Zn5FPVAdd1 = Zn5FP1;
+defvar Zn5FPVAdd2 = Zn5FP2;
+defvar Zn5FPVAdd3 = Zn5FP3;
+
+// Integer Multiplies, SAD, Blendvb
+defvar Zn5FPVMul0 = Zn5FP0;
+defvar Zn5FPVMul1 = Zn5FP1;
+defvar Zn5FPVMul2 = Zn5FP3;
+
+// Data Shuffles, Packs, Unpacks, Permute
+// Some complex shuffle operations are only available in pipe1.
+defvar Zn5FPVShuf = Zn5FP1;
+defvar Zn5FPVShufAux = Zn5FP2;
+
+// Bit Shift Left/Right operations
+defvar Zn5FPVShift0 = Zn5FP1;
+defvar Zn5FPVShift1 = Zn5FP2;
+
+// Moves and Logical operations on Packed Integer Data Types
+defvar Zn5FPVMisc0 = Zn5FP0;
+defvar Zn5FPVMisc1 = Zn5FP1;
+defvar Zn5FPVMisc2 = Zn5FP2;
+defvar Zn5FPVMisc3 = Zn5FP3;
+
+// *AES*
+defvar Zn5FPAES0 = Zn5FP0;
+defvar Zn5FPAES1 = Zn5FP1;
+
+// *CLM*
+defvar Zn5FPCLM0 = Zn5FP0;
+defvar Zn5FPCLM1 = Zn5FP1;
+
+// Execution pipeline grouping
+//===----------------------------------------------------------------------===//
+
+// FLD use pipes 1 and 2.
+def Zn5FPU12 : ProcResGroup<[Zn5FP1, Zn5FP2]>;
+
+// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
+def Zn5FPFMul01 : ProcResGroup<[Zn5FPFMul0, Zn5FPFMul1]>;
+
+// (v)FADD*
+// Some complex VADD operations are not available in all pipes.
+def Zn5FPFAdd01 : ProcResGroup<[Zn5FPFAdd0, Zn5FPFAdd1]>;
+
+// All convert operations except pack/unpack
+def Zn5FPFCvt01 : ProcResGroup<[Zn5FPFCvt0, Zn5FPFCvt1]>;
+
+// All Divide and Square Root except Reciprocal Approximation
+// def Zn5FPFDiv : ProcResGroup<[Zn5FPFDiv]>;
+
+// Moves and Logical operations on Floating Point Data Types
+def Zn5FPFMisc0123 : ProcResGroup<[Zn5FPFMisc0, Zn5FPFMisc1, Zn5FPFMisc2, Zn5FPFMisc3]>;
+
+// FIXUP and RANGE use FP01 pipelines
+def Zn5FPFMisc01 : ProcResGroup<[Zn5FPFMisc0, Zn5FPFMisc1]>;
+def Zn5FPFMisc12 : ProcResGroup<[Zn5FPFMisc1, Zn5FPFMisc2]>;
+// SCALE instructions use FP23 pipelines
+def Zn5FPFMisc23 : ProcResGroup<[Zn5FPFMisc2, Zn5FPFMisc3]>;
+def Zn5FPFMisc123 : ProcResGroup<[Zn5FPFMisc1,Zn5FPFMisc2, Zn5FPFMisc3]>;
+
+// Loads, Stores and Move to General Register (EX) Operations
+// Stores and floating point to general purpose register transfer
+// have 2 dedicated pipelines (pipe 5 and 6).
+defvar Zn5FPLd01 = Zn5FP45;
+
+// Note that FP stores are supported on two pipelines,
+// but throughput is limited to one per cycle.
+let Super = Zn5FP45 in
+def Zn5FPSt : ProcResource<1>;
+
+// Integer Adds, and Subtracts
+def Zn5FPVAdd0123 : ProcResGroup<[Zn5FPVAdd0, Zn5FPVAdd1, Zn5FPVAdd2, Zn5FPVAdd3]>;
+// Compares use first four pipes
+def Zn5FPU0123 : ProcResGroup<[Zn5FP0, Zn5FP1, Zn5FP2, Zn5FP3]>;
+
+def Zn5FPVAdd01: ProcResGroup<[Zn5FPVAdd0, Zn5FPVAdd1]>;
+def Zn5FPVAdd12: ProcResGroup<[Zn5FPVAdd1, Zn5FPVAdd2]>;
+def Zn5FPVAdd03: ProcResGroup<[Zn5FPVAdd0, Zn5FPVAdd3]>;
+
+// AVX512 Opmask pipelines
+def Zn5FPOpMask01: ProcResGroup<[Zn5FP0, Zn5FP3]>;
+def Zn5FPOpMask4: ProcResGroup<[Zn5FP45]>;
+
+// Integer Multiplies, SAD, Blendvb
+def Zn5FPVMul012 : ProcResGroup<[Zn5FPVMul0, Zn5FPVMul1, Zn5FPVMul2]>;
+
+// VNNIs execute in the first two pipes.
+def Zn5FPVMul01 : ProcResGroup<[Zn5FPVMul0, Zn5FPVMul1]>;
+
+// Data Shuffles, Packs, Unpacks, Permute
+// Some complex shuffle operations are only available in pipe1.
+def Zn5FPVShuf01 : ProcResGroup<[Zn5FPVShuf, Zn5FPVShufAux]>;
+
+// Bit Shift Left/Right operations
+def Zn5FPVShift01 : ProcResGroup<[Zn5FPVShift0, Zn5FPVShift1]>;
+
+// Moves and Logical operations on Packed Integer Data Types
+def Zn5FPVMisc0123 : ProcResGroup<[Zn5FPVMisc0, Zn5FPVMisc1, Zn5FPVMisc2, Zn5FPVMisc3]>;
+
+// *AES*
+def Zn5FPAES01 : ProcResGroup<[Zn5FPAES0, Zn5FPAES1]>;
+
+// *CLM*
+def Zn5FPCLM01 : ProcResGroup<[Zn5FPCLM0, Zn5FPCLM1]>;
+
+
+//
+// Scheduling
+//===----------------------------------------------------------------------===//
+
+// Agner, 21.8 Register renaming and out-of-order schedulers
+// The floating point register file has 384 vector registers
+// of 512b each in zen5.
+def Zn5FpPRF : RegisterFile<384, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
+                            6,  // Max moves that can be eliminated per cycle.
+                            0>; // Restrict move elimination to zero regs.
+
+// The floating-point scheduler has a 3*38 entry macro op capacity.
+// <...> the scheduler can issue 1 micro op per cycle for each pipe.
+// FIXME: those are two separate schedulers, not a single big one.
+def Zn5FP : ProcResGroup<[Zn5FP0, Zn5FP2,          /*Zn5FP4,*/ // scheduler 0
+                          Zn5FP1, Zn5FP3, Zn5FP45 /*Zn5FP5*/  // scheduler 1
+                         ]> {
+  let BufferSize = !mul(3, 38);
+}
+
+// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
+// even if floating-point scheduler is full.
+// FIXME: how to model this properly?
+
+
+//===----------------------------------------------------------------------===//
+// Load-Store Unit
+//
+
+// The LS unit contains four largely independent pipelines
+// enabling the execution of four memory operations per cycle.
+def Zn5LSU : ProcResource<4>;
+
+// All four memory operations can be loads.
+let Super = Zn5LSU in
+def Zn5Load : ProcResource<4> {
+  // The LS unit can process up to 128 out-of-order loads.
+  let BufferSize = 128;
+}
+
+def Zn5LoadQueue : LoadQueue<Zn5Load>;
+
+// A maximum of two of the memory operations can be stores.
+let Super = Zn5LSU in
+def Zn5Store : ProcResource<2> {
+  // The LS unit utilizes a 104-entry store queue (STQ).
+  let BufferSize = 104;
+}
+
+def Zn5StoreQueue : StoreQueue<Zn5Store>;
+
+//===----------------------------------------------------------------------===//
+// Basic helper classes.
+//===----------------------------------------------------------------------===//
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+
+multiclass __Zn5WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
+                         int Lat = 1, list<int> Res = [], int UOps = 1> {
+  def : WriteRes<SchedRW, ExePorts> {
+    let Latency = Lat;
+    let ReleaseAtCycles = Res;
+    let NumMicroOps = UOps;
+  }
+}
+
+multiclass __Zn5WriteResPair<X86FoldableSchedWrite SchedRW,
+                             list<ProcResourceKind> ExePorts, int Lat,
+                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
+                             ProcResourceKind AGU, int LoadRes> {
+  defm : __Zn5WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+
+  defm : __Zn5WriteRes<SchedRW.Folded,
+                       !listconcat([AGU, Zn5Load], ExePorts),
+                       !add(Lat, LoadLat),
+                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
+                         [],
+                         !listconcat([1, LoadRes],
+                           !if(!empty(Res),
+                             !listsplat(1, !size(ExePorts)),
+                             Res))),
+                       !add(UOps, LoadUOps)>;
+}
+
+// For classes without folded loads.
+multiclass Zn5WriteResInt<SchedWrite SchedRW,
+                          list<ProcResourceKind> ExePorts, int Lat = 1,
+                          list<int> Res = [], int UOps = 1> {
+  defm : __Zn5WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn5WriteResXMM<SchedWrite SchedRW,
+                          list<ProcResourceKind> ExePorts, int Lat = 1,
+                          list<int> Res = [], int UOps = 1> {
+  defm : __Zn5WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn5WriteResYMM<SchedWrite SchedRW,
+                          list<ProcResourceKind> ExePorts, int Lat = 1,
+                          list<int> Res = [], int UOps = 1> {
+  defm : __Zn5WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn5WriteResZMM<SchedWrite SchedRW,
+                          list<ProcResourceKind> ExePorts, int Lat = 1,
+                          list<int> Res = [], int UOps = 1> {
+  defm : __Zn5WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+// For classes with folded loads.
+multiclass Zn5WriteResIntPair<X86FoldableSchedWrite SchedRW,
+                              list<ProcResourceKind> ExePorts, int Lat = 1,
+                              list<int> Res = [], int UOps = 1,
+                              int LoadUOps = 0, int LoadRes = 1> {
+  defm : __Zn5WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+                           Znver5Model.LoadLatency,
+                           LoadUOps, Zn5AGU0123, LoadRes>;
+}
+
+multiclass Zn5WriteResXMMPair<X86FoldableSchedWrite SchedRW,
+                              list<ProcResourceKind> ExePorts, int Lat = 1,
+                              list<int> Res = [], int UOps = 1,
+                              int LoadUOps = 0, int LoadRes = 1> {
+  defm : __Zn5WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+                           Znver5Model.VecLoadLatency,
+                           LoadUOps, Zn5FPLd01, LoadRes>;
+}
+
+multiclass Zn5WriteResYMMPair<X86FoldableSchedWrite SchedRW,
+                              list<ProcResourceKind> ExePorts, int Lat = 1,
+                              list<int> Res = [], int UOps = 1,
+                              int LoadUOps = 0, int LoadRes = 1> {
+  defm : __Zn5WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+                           Znver5Model.VecLoadLatency,
+                           LoadUOps, Zn5FPLd01, LoadRes>;
+}
+
+multiclass Zn5WriteResZMMPair<X86FoldableSchedWrite SchedRW,
+                              list<ProcResourceKind> ExePorts, int Lat = 1,
+                              list<int> Res = [], int UOps = 2,
+                              int LoadUOps = 0, int LoadRes = 1> {
+  defm : __Zn5WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+                           Znver5Model.VecLoadLatency,
+                           LoadUOps, Zn5FPLd01, LoadRes>;
+}
+
+//===----------------------------------------------------------------------===//
+// Here be dragons.
+//===----------------------------------------------------------------------===//
+
+def : ReadAdvance<ReadAfterLd, Znver5Model.LoadLatency>;
+
+def : ReadAdvance<ReadAfterVecLd, Znver5Model.VecLoadLatency>;
+def : ReadAdvance<ReadAfterVecXLd, Znver5Model.VecLoadLatency>;
+def : ReadAdvance<ReadAfterVecYLd, Znver5Model.VecLoadLatency>;
+
+// There is 1 cycle of added latency for a result to cross
+// from F to I or I to F domain.
+def : ReadAdvance<ReadInt2Fpu, -1>;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+defm : Zn5WriteResInt<WriteRMW, [Zn5...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/131780