[llvm] 2005ae1 - [X86][SLM] WriteVecIMul instructions only take 1uop (REAPPLIED)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Sep 4 07:10:03 PDT 2021
Author: Simon Pilgrim
Date: 2021-09-04T15:03:56+01:00
New Revision: 2005ae15a66dd5d8a9845f3652192a70bd36d921
URL: https://github.com/llvm/llvm-project/commit/2005ae15a66dd5d8a9845f3652192a70bd36d921
DIFF: https://github.com/llvm/llvm-project/commit/2005ae15a66dd5d8a9845f3652192a70bd36d921.diff
LOG: [X86][SLM] WriteVecIMul instructions only take 1uop (REAPPLIED)
The xmm variants have half the throughput of the mmx variants (and +1cy latency), but are still 1uop.
I still need to do more thorough testing of SLM on test-suite before fixing the obviously bad numbers for WritePMULLD.
However, this change helps the D103695 helper script get more accurate numbers for vXi32 multiplies of extended operands (i.e. where we can use PMADDWD, PMULLW/PMULHW etc.). This matches what Intel AoM / Agner / llvm-exegesis report.
Added:
Modified:
llvm/lib/Target/X86/X86ScheduleSLM.td
llvm/test/CodeGen/X86/slow-pmulld.ll
llvm/test/tools/llvm-mca/X86/SLM/resources-sse2.s
llvm/test/tools/llvm-mca/X86/SLM/resources-sse41.s
llvm/test/tools/llvm-mca/X86/SLM/resources-ssse3.s
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index a545f3cecb7c2..2bcb33e6b0bca 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -372,8 +372,8 @@ defm : SLMWriteResPair<WriteVecALUX, [SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WriteVecALUY, [SLM_FPC_RSV01], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>;
-defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 5, [2], 2>;
-defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 5, [2], 2>;
+defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 5, [2]>;
+defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 5, [2]>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
// FIXME: The below is closer to correct, but caused some perf regressions.
//defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index aeccae300eea5..9a10a2353e16a 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -537,40 +537,40 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
; SLM32-LABEL: test_mul_v16i32_v16i16:
; SLM32: # %bb.0:
-; SLM32-NEXT: movdqa %xmm0, %xmm4
-; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM32-NEXT: movdqa %xmm1, %xmm3
-; SLM32-NEXT: movdqa %xmm4, %xmm2
-; SLM32-NEXT: pmullw %xmm0, %xmm4
+; SLM32-NEXT: movdqa %xmm0, %xmm1
+; SLM32-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM32-NEXT: movdqa %xmm1, %xmm2
+; SLM32-NEXT: movdqa %xmm3, %xmm4
+; SLM32-NEXT: pmullw %xmm0, %xmm1
; SLM32-NEXT: pmulhuw %xmm0, %xmm2
; SLM32-NEXT: pmullw %xmm0, %xmm3
-; SLM32-NEXT: pmulhuw %xmm0, %xmm1
-; SLM32-NEXT: movdqa %xmm4, %xmm0
-; SLM32-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SLM32-NEXT: pmulhuw %xmm0, %xmm4
+; SLM32-NEXT: movdqa %xmm1, %xmm0
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM32-NEXT: movdqa %xmm3, %xmm2
-; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SLM32-NEXT: movdqa %xmm4, %xmm1
+; SLM32-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLM32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM32-NEXT: retl
;
; SLM64-LABEL: test_mul_v16i32_v16i16:
; SLM64: # %bb.0:
-; SLM64-NEXT: movdqa %xmm0, %xmm4
-; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
; SLM64-NEXT: movdqa %xmm1, %xmm3
-; SLM64-NEXT: movdqa %xmm4, %xmm2
-; SLM64-NEXT: pmullw %xmm0, %xmm4
+; SLM64-NEXT: movdqa %xmm0, %xmm1
+; SLM64-NEXT: movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
+; SLM64-NEXT: movdqa %xmm1, %xmm2
+; SLM64-NEXT: movdqa %xmm3, %xmm4
+; SLM64-NEXT: pmullw %xmm0, %xmm1
; SLM64-NEXT: pmulhuw %xmm0, %xmm2
; SLM64-NEXT: pmullw %xmm0, %xmm3
-; SLM64-NEXT: pmulhuw %xmm0, %xmm1
-; SLM64-NEXT: movdqa %xmm4, %xmm0
-; SLM64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SLM64-NEXT: pmulhuw %xmm0, %xmm4
+; SLM64-NEXT: movdqa %xmm1, %xmm0
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SLM64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SLM64-NEXT: movdqa %xmm3, %xmm2
-; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SLM64-NEXT: movdqa %xmm4, %xmm1
+; SLM64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SLM64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SLM64-NEXT: retq
;
; SLOW32-LABEL: test_mul_v16i32_v16i16:
diff --git a/llvm/test/tools/llvm-mca/X86/SLM/resources-sse2.s b/llvm/test/tools/llvm-mca/X86/SLM/resources-sse2.s
index 8c5fff166cab4..dfdfa1320a2a5 100644
--- a/llvm/test/tools/llvm-mca/X86/SLM/resources-sse2.s
+++ b/llvm/test/tools/llvm-mca/X86/SLM/resources-sse2.s
@@ -563,8 +563,8 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 pextrw $1, %xmm0, %ecx
# CHECK-NEXT: 1 1 1.00 pinsrw $1, %eax, %xmm0
# CHECK-NEXT: 1 4 1.00 * pinsrw $1, (%rax), %xmm0
-# CHECK-NEXT: 2 5 2.00 pmaddwd %xmm0, %xmm2
-# CHECK-NEXT: 2 8 2.00 * pmaddwd (%rax), %xmm2
+# CHECK-NEXT: 1 5 2.00 pmaddwd %xmm0, %xmm2
+# CHECK-NEXT: 1 8 2.00 * pmaddwd (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 pmaxsw %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * pmaxsw (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 pmaxub %xmm0, %xmm2
@@ -574,16 +574,16 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 pminub %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * pminub (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 pmovmskb %xmm0, %ecx
-# CHECK-NEXT: 2 5 2.00 pmulhuw %xmm0, %xmm2
-# CHECK-NEXT: 2 8 2.00 * pmulhuw (%rax), %xmm2
-# CHECK-NEXT: 2 5 2.00 pmulhw %xmm0, %xmm2
-# CHECK-NEXT: 2 8 2.00 * pmulhw (%rax), %xmm2
-# CHECK-NEXT: 2 5 2.00 pmullw %xmm0, %xmm2
-# CHECK-NEXT: 2 8 2.00 * pmullw (%rax), %xmm2
+# CHECK-NEXT: 1 5 2.00 pmulhuw %xmm0, %xmm2
+# CHECK-NEXT: 1 8 2.00 * pmulhuw (%rax), %xmm2
+# CHECK-NEXT: 1 5 2.00 pmulhw %xmm0, %xmm2
+# CHECK-NEXT: 1 8 2.00 * pmulhw (%rax), %xmm2
+# CHECK-NEXT: 1 5 2.00 pmullw %xmm0, %xmm2
+# CHECK-NEXT: 1 8 2.00 * pmullw (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 pmuludq %mm0, %mm2
# CHECK-NEXT: 1 7 1.00 * pmuludq (%rax), %mm2
-# CHECK-NEXT: 2 5 2.00 pmuludq %xmm0, %xmm2
-# CHECK-NEXT: 2 8 2.00 * pmuludq (%rax), %xmm2
+# CHECK-NEXT: 1 5 2.00 pmuludq %xmm0, %xmm2
+# CHECK-NEXT: 1 8 2.00 * pmuludq (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 por %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * por (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 psadbw %xmm0, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/SLM/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/SLM/resources-sse41.s
index 2bcebead6181a..e0e19e681853b 100644
--- a/llvm/test/tools/llvm-mca/X86/SLM/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/SLM/resources-sse41.s
@@ -237,8 +237,8 @@ roundss $1, (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 * pmovzxwd (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 pmovzxwq %xmm0, %xmm2
# CHECK-NEXT: 1 4 1.00 * pmovzxwq (%rax), %xmm2
-# CHECK-NEXT: 2 5 2.00 pmuldq %xmm0, %xmm2
-# CHECK-NEXT: 2 8 2.00 * pmuldq (%rax), %xmm2
+# CHECK-NEXT: 1 5 2.00 pmuldq %xmm0, %xmm2
+# CHECK-NEXT: 1 8 2.00 * pmuldq (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 pmulld %xmm0, %xmm2
# CHECK-NEXT: 1 7 1.00 * pmulld (%rax), %xmm2
# CHECK-NEXT: 1 1 0.50 ptest %xmm0, %xmm1
diff --git a/llvm/test/tools/llvm-mca/X86/SLM/resources-ssse3.s b/llvm/test/tools/llvm-mca/X86/SLM/resources-ssse3.s
index f6c1bfe3bae8d..3fb48787d929f 100644
--- a/llvm/test/tools/llvm-mca/X86/SLM/resources-ssse3.s
+++ b/llvm/test/tools/llvm-mca/X86/SLM/resources-ssse3.s
@@ -148,12 +148,12 @@ psignw (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 * phsubw (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 pmaddubsw %mm0, %mm2
# CHECK-NEXT: 1 7 1.00 * pmaddubsw (%rax), %mm2
-# CHECK-NEXT: 2 5 2.00 pmaddubsw %xmm0, %xmm2
-# CHECK-NEXT: 2 8 2.00 * pmaddubsw (%rax), %xmm2
+# CHECK-NEXT: 1 5 2.00 pmaddubsw %xmm0, %xmm2
+# CHECK-NEXT: 1 8 2.00 * pmaddubsw (%rax), %xmm2
# CHECK-NEXT: 1 4 1.00 pmulhrsw %mm0, %mm2
# CHECK-NEXT: 1 7 1.00 * pmulhrsw (%rax), %mm2
-# CHECK-NEXT: 2 5 2.00 pmulhrsw %xmm0, %xmm2
-# CHECK-NEXT: 2 8 2.00 * pmulhrsw (%rax), %xmm2
+# CHECK-NEXT: 1 5 2.00 pmulhrsw %xmm0, %xmm2
+# CHECK-NEXT: 1 8 2.00 * pmulhrsw (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 pshufb %mm0, %mm2
# CHECK-NEXT: 1 4 1.00 * pshufb (%rax), %mm2
# CHECK-NEXT: 4 5 5.00 pshufb %xmm0, %xmm2
More information about the llvm-commits mailing list