[llvm] 0f04936 - [X86] AMD Zen 3: MULX produces low part of the result in 3cy, +1cy for high part
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 27 03:27:42 PDT 2021
Author: Roman Lebedev
Date: 2021-08-27T13:27:05+03:00
New Revision: 0f04936a2d4e3ec7db681547876f7669c445af0e
URL: https://github.com/llvm/llvm-project/commit/0f04936a2d4e3ec7db681547876f7669c445af0e
DIFF: https://github.com/llvm/llvm-project/commit/0f04936a2d4e3ec7db681547876f7669c445af0e.diff
LOG: [X86] AMD Zen 3: MULX produces low part of the result in 3cy, +1cy for high part
As per llvm-exegesis measurements.
Added:
Modified:
llvm/lib/Target/X86/X86ScheduleZnver3.td
llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
llvm/test/tools/llvm-mca/X86/cv_fpo_directive_no_segfault.s
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index c2be9ec6085d1..86f1b285fec21 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -617,19 +617,12 @@ defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/
defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
-defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 4, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
-
-def Zn3MULX32rr : SchedWriteRes<[Zn3Multiplier]> {
- let Latency = 4;
- let ResourceCycles = [1];
- let NumMicroOps = 2;
-}
-def : InstRW<[Zn3MULX32rr, WriteIMulH], (instrs MULX32rr)>;
+defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
def Zn3MULX32rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3MULX32rr.Latency);
+ let Latency = !add(Znver3Model.LoadLatency, 3);
let ResourceCycles = [1, 1, 2];
- let NumMicroOps = Zn3MULX32rr.NumMicroOps;
+ let NumMicroOps = 2;
}
def : InstRW<[Zn3MULX32rm, WriteIMulHLd,
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -638,19 +631,12 @@ def : InstRW<[Zn3MULX32rm, WriteIMulHLd,
defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
-defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 4, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
-
-def Zn3MULX64rr : SchedWriteRes<[Zn3Multiplier]> {
- let Latency = 4;
- let ResourceCycles = [1];
- let NumMicroOps = 2;
-}
-def : InstRW<[Zn3MULX64rr, WriteIMulH], (instrs MULX64rr)>;
+defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
def Zn3MULX64rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3Multiplier]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3MULX64rr.Latency);
+ let Latency = !add(Znver3Model.LoadLatency, 3);
let ResourceCycles = [1, 1, 2];
- let NumMicroOps = Zn3MULX64rr.NumMicroOps;
+ let NumMicroOps = 2;
}
def : InstRW<[Zn3MULX64rm, WriteIMulHLd,
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
index 0db28c9708dfc..93f8d7616cb9c 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-lo-reg-use.s
@@ -13,12 +13,12 @@ mulxq %rax, %rax, %rcx
# CHECK: Iterations: 2
# CHECK-NEXT: Instructions: 2
-# CHECK-NEXT: Total Cycles: 11
+# CHECK-NEXT: Total Cycles: 10
# CHECK-NEXT: Total uOps: 4
# CHECK: Dispatch Width: 6
-# CHECK-NEXT: uOps Per Cycle: 0.36
-# CHECK-NEXT: IPC: 0.18
+# CHECK-NEXT: uOps Per Cycle: 0.40
+# CHECK-NEXT: IPC: 0.20
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
@@ -66,11 +66,10 @@ mulxq %rax, %rax, %rcx
# CHECK-NEXT: - - - - 1.00 - - - - - - - - - - - - - - - - - - mulxl %eax, %eax, %ecx
# CHECK: Timeline view:
-# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %ecx
-# CHECK-NEXT: [1,0] D====eeeeER mulxl %eax, %eax, %ecx
+# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %ecx
+# CHECK-NEXT: [1,0] D===eeeeER mulxl %eax, %eax, %ecx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -79,18 +78,18 @@ mulxq %rax, %rax, %rcx
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 3.0 0.5 0.0 mulxl %eax, %eax, %ecx
+# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxl %eax, %eax, %ecx
# CHECK: [1] Code Region
# CHECK: Iterations: 2
# CHECK-NEXT: Instructions: 2
-# CHECK-NEXT: Total Cycles: 11
+# CHECK-NEXT: Total Cycles: 10
# CHECK-NEXT: Total uOps: 4
# CHECK: Dispatch Width: 6
-# CHECK-NEXT: uOps Per Cycle: 0.36
-# CHECK-NEXT: IPC: 0.18
+# CHECK-NEXT: uOps Per Cycle: 0.40
+# CHECK-NEXT: IPC: 0.20
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
@@ -138,11 +137,10 @@ mulxq %rax, %rax, %rcx
# CHECK-NEXT: - - - - 1.00 - - - - - - - - - - - - - - - - - - mulxq %rax, %rax, %rcx
# CHECK: Timeline view:
-# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rcx
-# CHECK-NEXT: [1,0] D====eeeeER mulxq %rax, %rax, %rcx
+# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rcx
+# CHECK-NEXT: [1,0] D===eeeeER mulxq %rax, %rax, %rcx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -151,4 +149,4 @@ mulxq %rax, %rax, %rcx
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 3.0 0.5 0.0 mulxq %rax, %rax, %rcx
+# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxq %rax, %rax, %rcx
diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
index d7858ecf8ee1f..bbc5cfa398708 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-read-advance.s
@@ -15,12 +15,12 @@ mulxq (%rdi), %rax, %rdx
# CHECK: Iterations: 2
# CHECK-NEXT: Instructions: 2
-# CHECK-NEXT: Total Cycles: 15
+# CHECK-NEXT: Total Cycles: 14
# CHECK-NEXT: Total uOps: 4
# CHECK: Dispatch Width: 6
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.13
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.14
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
@@ -68,11 +68,11 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: - 0.50 0.50 - 2.00 - - - - - - - - - - - 0.50 0.50 - 0.50 0.50 - - mulxl (%rdi), %eax, %edx
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . mulxl (%rdi), %eax, %edx
-# CHECK-NEXT: [1,0] D====eeeeeeeeER mulxl (%rdi), %eax, %edx
+# CHECK: [0,0] DeeeeeeeeER . mulxl (%rdi), %eax, %edx
+# CHECK-NEXT: [1,0] D===eeeeeeeeER mulxl (%rdi), %eax, %edx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -81,18 +81,18 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 3.0 0.5 0.0 mulxl (%rdi), %eax, %edx
+# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxl (%rdi), %eax, %edx
# CHECK: [1] Code Region
# CHECK: Iterations: 2
# CHECK-NEXT: Instructions: 2
-# CHECK-NEXT: Total Cycles: 15
+# CHECK-NEXT: Total Cycles: 14
# CHECK-NEXT: Total uOps: 4
# CHECK: Dispatch Width: 6
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.13
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.14
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
@@ -140,11 +140,11 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: - 0.50 0.50 - 2.00 - - - - - - - - - - - 0.50 0.50 - 0.50 0.50 - - mulxq (%rdi), %rax, %rdx
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . mulxq (%rdi), %rax, %rdx
-# CHECK-NEXT: [1,0] D====eeeeeeeeER mulxq (%rdi), %rax, %rdx
+# CHECK: [0,0] DeeeeeeeeER . mulxq (%rdi), %rax, %rdx
+# CHECK-NEXT: [1,0] D===eeeeeeeeER mulxq (%rdi), %rax, %rdx
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -153,4 +153,4 @@ mulxq (%rdi), %rax, %rdx
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 3.0 0.5 0.0 mulxq (%rdi), %rax, %rdx
+# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxq (%rdi), %rax, %rdx
diff --git a/llvm/test/tools/llvm-mca/X86/cv_fpo_directive_no_segfault.s b/llvm/test/tools/llvm-mca/X86/cv_fpo_directive_no_segfault.s
index d2f64acf97dfe..efb4753f2503a 100644
--- a/llvm/test/tools/llvm-mca/X86/cv_fpo_directive_no_segfault.s
+++ b/llvm/test/tools/llvm-mca/X86/cv_fpo_directive_no_segfault.s
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=generic -resource-pressure=false -instruction-info=false < %s | FileCheck %s
.cv_fpo_pushreg ebx
@@ -7,3 +8,11 @@ add %ecx, %ecx
add %edx, %edx
# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 137
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 4
+# CHECK-NEXT: uOps Per Cycle: 2.92
+# CHECK-NEXT: IPC: 2.92
+# CHECK-NEXT: Block RThroughput: 1.3
More information about the llvm-commits mailing list