[llvm] eea6a27 - [X86] WriteFShuffle256 shuffles aren't microcoded in the llvm sense
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 29 04:03:52 PDT 2022
Author: Simon Pilgrim
Date: 2022-10-29T12:03:43+01:00
New Revision: eea6a2782e852ee38a56af8245a27d864b56b592
URL: https://github.com/llvm/llvm-project/commit/eea6a2782e852ee38a56af8245a27d864b56b592
DIFF: https://github.com/llvm/llvm-project/commit/eea6a2782e852ee38a56af8245a27d864b56b592.diff
LOG: [X86] WriteFShuffle256 shuffles aren't microcoded in the llvm sense
znver1/2 might have poor throughput for cross-lane shuffles, but they don't consume 100 cycles of resources.
I think there was a misunderstanding between the AMD definition of microcoding (more than 2-3 uops) and the LLVM definition (here be dragons - impossible to approximately model the instruction).
This is more yak shaving stemming from D103695 - this time working out why codegen involving broadcasts gives such weird numbers.
Added:
Modified:
llvm/lib/Target/X86/X86ScheduleZnver1.td
llvm/lib/Target/X86/X86ScheduleZnver2.td
llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s
llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 79beb3eacf9af..01deab36d930b 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -372,6 +372,8 @@ defm : ZnWriteResFpuPair<WriteFSqrt64X, [ZnFPU3], 20, [8]>;
defm : ZnWriteResFpuPair<WriteFSqrt64Y, [ZnFPU3], 20, [16], 1>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : ZnWriteResFpuPair<WriteFSqrt80, [ZnFPU3], 20, [20]>;
+defm : ZnWriteResFpuPair<WriteFShuffle256, [ZnFPU], 2>;
+defm : ZnWriteResFpuPair<WriteFVarShuffle256, [ZnFPU], 2>;
// Vector integer operations which uses FPU units
defm : X86WriteRes<WriteVecLoad, [ZnAGU], 8, [1], 1>;
@@ -479,11 +481,6 @@ defm : ZnWriteResFpuPair<WriteAESKeyGen, [ZnFPU01], 4>;
def : WriteRes<WriteFence, [ZnAGU]>;
def : WriteRes<WriteNop, []>;
-// Following instructions with latency=100 are microcoded.
-// We set long latency so as to block the entire pipeline.
-defm : ZnWriteResFpuPair<WriteFShuffle256, [ZnFPU], 100>;
-defm : ZnWriteResFpuPair<WriteFVarShuffle256, [ZnFPU], 100>;
-
// Microcoded Instructions
def ZnWriteMicrocoded : SchedWriteRes<[]> {
let Latency = 100;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td
index b4f72a968959b..788b71e6beab8 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver2.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -371,6 +371,8 @@ defm : Zn2WriteResFpuPair<WriteFSqrt64X, [Zn2FPU3], 20, [10]>;
defm : Zn2WriteResFpuPair<WriteFSqrt64Y, [Zn2FPU3], 20, [10]>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : Zn2WriteResFpuPair<WriteFSqrt80, [Zn2FPU3], 20, [20]>;
+defm : Zn2WriteResFpuPair<WriteFShuffle256, [Zn2FPU], 2>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 2>;
// Vector integer operations which uses FPU units
defm : X86WriteRes<WriteVecLoad, [Zn2AGU], 8, [1], 1>;
@@ -478,11 +480,6 @@ defm : Zn2WriteResFpuPair<WriteAESKeyGen, [Zn2FPU01], 4>;
def : WriteRes<WriteFence, [Zn2AGU]>;
def : WriteRes<WriteNop, []>;
-// Following instructions with latency=100 are microcoded.
-// We set long latency so as to block the entire pipeline.
-defm : Zn2WriteResFpuPair<WriteFShuffle256, [Zn2FPU], 100>;
-defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>;
-
// Microcoded Instructions
def Zn2WriteMicrocoded : SchedWriteRes<[]> {
let Latency = 100;
diff --git a/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s
index 0648c3193386e..179e6a4921f5a 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s
@@ -461,8 +461,8 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 8 0.50 * vbroadcasti128 (%rax), %ymm0
-# CHECK-NEXT: 1 100 0.25 vbroadcastsd %xmm0, %ymm0
-# CHECK-NEXT: 1 100 0.25 vbroadcastss %xmm0, %ymm0
+# CHECK-NEXT: 1 2 0.25 vbroadcastsd %xmm0, %ymm0
+# CHECK-NEXT: 1 2 0.25 vbroadcastss %xmm0, %ymm0
# CHECK-NEXT: 1 2 0.25 vextracti128 $1, %ymm0, %xmm2
# CHECK-NEXT: 1 1 0.50 * vextracti128 $1, %ymm0, (%rax)
# CHECK-NEXT: 1 100 0.25 * vgatherdpd %xmm0, (%rax,%xmm1,2), %xmm2
@@ -562,10 +562,10 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 9 0.50 * vperm2i128 $1, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 2 0.25 vpermd %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 9 0.50 * vpermd (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 1 100 0.25 vpermpd $1, %ymm0, %ymm2
-# CHECK-NEXT: 1 107 0.50 * vpermpd $1, (%rax), %ymm2
-# CHECK-NEXT: 1 100 0.25 vpermps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1 107 0.50 * vpermps (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 1 2 0.25 vpermpd $1, %ymm0, %ymm2
+# CHECK-NEXT: 1 9 0.50 * vpermpd $1, (%rax), %ymm2
+# CHECK-NEXT: 1 2 0.25 vpermps %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1 9 0.50 * vpermps (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 2 0.25 vpermq $1, %ymm0, %ymm2
# CHECK-NEXT: 1 9 0.50 * vpermq $1, (%rax), %ymm2
# CHECK-NEXT: 1 100 0.25 * vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s
index 93eb00e7e9b07..e77f2f7eae055 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s
@@ -461,8 +461,8 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 8 0.33 * vbroadcasti128 (%rax), %ymm0
-# CHECK-NEXT: 1 100 0.25 vbroadcastsd %xmm0, %ymm0
-# CHECK-NEXT: 1 100 0.25 vbroadcastss %xmm0, %ymm0
+# CHECK-NEXT: 1 2 0.25 vbroadcastsd %xmm0, %ymm0
+# CHECK-NEXT: 1 2 0.25 vbroadcastss %xmm0, %ymm0
# CHECK-NEXT: 1 2 0.25 vextracti128 $1, %ymm0, %xmm2
# CHECK-NEXT: 1 1 0.33 * vextracti128 $1, %ymm0, (%rax)
# CHECK-NEXT: 1 100 0.25 * vgatherdpd %xmm0, (%rax,%xmm1,2), %xmm2
@@ -562,10 +562,10 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 9 0.33 * vperm2i128 $1, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 2 0.25 vpermd %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 9 0.33 * vpermd (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 1 100 0.25 vpermpd $1, %ymm0, %ymm2
-# CHECK-NEXT: 1 107 0.33 * vpermpd $1, (%rax), %ymm2
-# CHECK-NEXT: 1 100 0.25 vpermps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 1 107 0.33 * vpermps (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 1 2 0.25 vpermpd $1, %ymm0, %ymm2
+# CHECK-NEXT: 1 9 0.33 * vpermpd $1, (%rax), %ymm2
+# CHECK-NEXT: 1 2 0.25 vpermps %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1 9 0.33 * vpermps (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 2 0.25 vpermq $1, %ymm0, %ymm2
# CHECK-NEXT: 1 9 0.33 * vpermq $1, (%rax), %ymm2
# CHECK-NEXT: 1 100 0.25 * vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2
More information about the llvm-commits
mailing list