[llvm] [X86] Fix some values for Znver4 model (PR #161405)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 30 12:22:31 PDT 2025
https://github.com/NexusXe updated https://github.com/llvm/llvm-project/pull/161405
>From a87cd3ed6d2e2c3f02ae8f31559116dc2aa9e84e Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:46:24 -0500
Subject: [PATCH 01/11] fix documentation reference
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index cc300548a50e6..384c7fc591490 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -15,7 +15,7 @@
//===----------------------------------------------------------------------===//
def Znver4Model : SchedMachineModel {
- // AMD SOG Zen4, 2.9.6 Dispatch
+ // AMD SOG Zen4, 2.9.8 Dispatch
// The processor may dispatch up to 6 macro ops per cycle
// into the execution engine.
let IssueWidth = 6;
>From d4dfe19914e13019863d3451e077505da0890fff Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:46:39 -0500
Subject: [PATCH 02/11] better HighLatency value
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 384c7fc591490..2a8e614d62512 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel {
int VecLoadLatency = 7;
// Latency of a simple store operation.
int StoreLatency = 1;
- // FIXME:
- let HighLatency = 25; // FIXME: any better choice?
+ // Mean and median value for all instructions with latencies >6
+ // Source: Zen4 Instruction Latencies spreadsheet (included with SOG)
+ let HighLatency = 13;
// AMD SOG Zen4, 2.8 Optimizing Branching
// The branch misprediction penalty is in the range from 11 to 18 cycles,
// <...>. The common case penalty is 13 cycles.
>From 78cf3aaa5806a1600b3960a7512f425f87d6aff8 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:47:08 -0500
Subject: [PATCH 03/11] LEA metrics
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 2a8e614d62512..55a4c4a1388b7 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -586,10 +586,11 @@ def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>; // LEA instructions can't fold loads.
// This write is used for slow LEA instructions.
+// values from uops.info
def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 2;
- let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let Latency = 3;
+ let ReleaseAtCycles = [1, 1, 1, 2];
+ let NumMicroOps = 4;
}
// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
@@ -613,9 +614,10 @@ def Zn4WriteLEA : SchedWriteVariant<[
def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
+// values from uops.info
def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 2; // FIXME: not from llvm-exegesis
- let ReleaseAtCycles = [4];
+ let Latency = 2;
+ let ReleaseAtCycles = [1, 1];
let NumMicroOps = 2;
}
>From 0fe425b861ba22f0c46f18e676234a5ca08de578 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:47:33 -0500
Subject: [PATCH 04/11] (CMP)XCHG uops
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 55a4c4a1388b7..8fec28cb34118 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -663,14 +663,14 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 3; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [24];
- let NumMicroOps = 19;
+ let NumMicroOps = 15;
}
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 4; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [59];
- let NumMicroOps = 28;
+ let NumMicroOps = 26;
}
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
@@ -684,7 +684,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a
def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = 5;
+ let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
>From c89b3ec25173b8cad6dcb0a874df89b72f132fe6 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:47:53 -0500
Subject: [PATCH 05/11] (I)DIV values from docs/uops.info
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 8fec28cb34118..3352a9beb516a 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -696,16 +696,14 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]>
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
// Integer division.
-// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
-// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
-defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
+defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 18, [18], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
>From 569e5dfaa38385af245633e31185a21b3d134ce3 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:48:13 -0500
Subject: [PATCH 06/11] use zen4 BSF/BSR uops count
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 3352a9beb516a..9aa31b31a992c 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -705,8 +705,8 @@ defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
-defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
-defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
+defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan forward.
+defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan reverse.
defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
>From 7a59f35470ead8cb48582e17f7ff259e1f0d5eac Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:49:10 -0500
Subject: [PATCH 07/11] TZCNT in 1 uop
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 9aa31b31a992c..fc454eb5ddee0 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -726,12 +726,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
}
def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
-defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
+defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 1, [1], 1>; // Trailing zero count.
def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 2;
- let ReleaseAtCycles = [4];
- let NumMicroOps = 2;
+ let Latency = 1;
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
>From e7fbab91ab6f496631fcc61c57f50312f28ab967 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:49:52 -0500
Subject: [PATCH 08/11] some higher latency string instructions
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index fc454eb5ddee0..86793f4de1dd0 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1327,9 +1327,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 7, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
+defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 7, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
>From 9f5e84ed0ee08c9d92266a3aeaa1563a8951f180 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 11:55:18 -0500
Subject: [PATCH 09/11] use Zen4 CLMUL/VPERM(S/D) values
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 86793f4de1dd0..405c0a2e9fbd1 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1341,7 +1341,7 @@ defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
// Carry-less multiplication instructions.
-defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
+defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [3], 4>;
// EMMS/FEMMS
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
@@ -1387,23 +1387,23 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 7;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
- let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 6;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
@@ -1415,9 +1415,9 @@ def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 5;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
>From 8d31f9242e6b101163b4b491ce2e44e6ba507626 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 12:03:31 -0500
Subject: [PATCH 10/11] replace FIXME for latency
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 405c0a2e9fbd1..421c6eb6bf0b9 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -616,7 +616,7 @@ def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
// values from uops.info
def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 2;
+ let Latency = 2; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [1, 1];
let NumMicroOps = 2;
}
>From 9d59c2761062718fe9159a54a2085ea539d5c2c1 Mon Sep 17 00:00:00 2001
From: NexusXe <andastrike at gmail.com>
Date: Tue, 30 Sep 2025 14:22:16 -0500
Subject: [PATCH 11/11] revert multiop ReleaseAtCycles value
---
llvm/lib/Target/X86/X86ScheduleZnver4.td | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 421c6eb6bf0b9..9d6b57d51808a 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -617,7 +617,7 @@ def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
// values from uops.info
def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 2; // FIXME: not from llvm-exegesis
- let ReleaseAtCycles = [1, 1];
+ let ReleaseAtCycles = [4];
let NumMicroOps = 2;
}
More information about the llvm-commits
mailing list