[llvm] [X86] Fix some values for Znver4 model (PR #161405)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 30 10:07:39 PDT 2025
https://github.com/NexusXe created https://github.com/llvm/llvm-project/pull/161405
This PR fixes a handful of latency and uop changes between Znver3 and Znver4 that were otherwise copied from Znver3.
Latency and uop values listed that matched Zen3 on uops.info were updated to those for Zen4.
>From a87cd3ed6d2e2c3f02ae8f31559116dc2aa9e84e Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:46:24 -0500
Subject: [PATCH 1/9] fix documentation reference
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index cc300548a50e6..384c7fc591490 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -15,7 +15,7 @@
//===----------------------------------------------------------------------===//
def Znver4Model : SchedMachineModel {
- // AMD SOG Zen4, 2.9.6 Dispatch
+ // AMD SOG Zen4, 2.9.8 Dispatch
// The processor may dispatch up to 6 macro ops per cycle
// into the execution engine.
let IssueWidth = 6;
>From d4dfe19914e13019863d3451e077505da0890fff Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:46:39 -0500
Subject: [PATCH 2/9] better HighLatency value
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 384c7fc591490..2a8e614d62512 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel {
int VecLoadLatency = 7;
// Latency of a simple store operation.
int StoreLatency = 1;
- // FIXME:
- let HighLatency = 25; // FIXME: any better choice?
+ // Mean and median value for all instructions with latencies >6
+ // Source: Zen4 Instruction Latencies spreadsheet (included with SOG)
+ let HighLatency = 13;
// AMD SOG Zen4, 2.8 Optimizing Branching
// The branch misprediction penalty is in the range from 11 to 18 cycles,
// <...>. The common case penalty is 13 cycles.
>From 78cf3aaa5806a1600b3960a7512f425f87d6aff8 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:47:08 -0500
Subject: [PATCH 3/9] LEA metrics
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 2a8e614d62512..55a4c4a1388b7 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -586,10 +586,11 @@ def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>; // LEA instructions can't fold loads.
// This write is used for slow LEA instructions.
+// values from uops.info
def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 2;
- let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let Latency = 3;
+ let ReleaseAtCycles = [1, 1, 1, 2];
+ let NumMicroOps = 4;
}
// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
@@ -613,9 +614,10 @@ def Zn4WriteLEA : SchedWriteVariant<[
def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
+// values from uops.info
def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 2; // FIXME: not from llvm-exegesis
- let ReleaseAtCycles = [4];
+ let Latency = 2;
+ let ReleaseAtCycles = [1, 1];
let NumMicroOps = 2;
}
>From 0fe425b861ba22f0c46f18e676234a5ca08de578 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:47:33 -0500
Subject: [PATCH 4/9] (CMP)XCHG uops
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 55a4c4a1388b7..8fec28cb34118 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -663,14 +663,14 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 3; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [24];
- let NumMicroOps = 19;
+ let NumMicroOps = 15;
}
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 4; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [59];
- let NumMicroOps = 28;
+ let NumMicroOps = 26;
}
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
@@ -684,7 +684,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a
def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = 5;
+ let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
>From c89b3ec25173b8cad6dcb0a874df89b72f132fe6 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:47:53 -0500
Subject: [PATCH 5/9] (I)DIV values from docs/uops.info
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 8fec28cb34118..3352a9beb516a 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -696,16 +696,14 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]>
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
// Integer division.
-// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
-// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
-defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
+defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 18, [18], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
>From 569e5dfaa38385af245633e31185a21b3d134ce3 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:48:13 -0500
Subject: [PATCH 6/9] use zen4 BSF/BSR uops count
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 3352a9beb516a..9aa31b31a992c 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -705,8 +705,8 @@ defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
-defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
-defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
+defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan forward.
+defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan reverse.
defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
>From 7a59f35470ead8cb48582e17f7ff259e1f0d5eac Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:49:10 -0500
Subject: [PATCH 7/9] TZCNT in 1 uop
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 9aa31b31a992c..fc454eb5ddee0 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -726,12 +726,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
}
def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
-defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
+defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 1, [1], 1>; // Trailing zero count.
def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 2;
- let ReleaseAtCycles = [4];
- let NumMicroOps = 2;
+ let Latency = 1;
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
>From e7fbab91ab6f496631fcc61c57f50312f28ab967 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:49:52 -0500
Subject: [PATCH 8/9] some higher latency string instructions
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index fc454eb5ddee0..86793f4de1dd0 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1327,9 +1327,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 7, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
+defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 7, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
>From 9f5e84ed0ee08c9d92266a3aeaa1563a8951f180 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:55:18 -0500
Subject: [PATCH 9/9] use Zen4 CLMUL/VPERM(S/D) values
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 86793f4de1dd0..405c0a2e9fbd1 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1341,7 +1341,7 @@ defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
// Carry-less multiplication instructions.
-defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
+defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [3], 4>;
// EMMS/FEMMS
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
@@ -1387,23 +1387,23 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 7;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
- let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 6;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
@@ -1415,9 +1415,9 @@ def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 5;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
More information about the llvm-commits
mailing list