[llvm] faff990 - [X86] Fix Icelake VPMULLQ zmm pipes and adjust AVX512DQ v8i64 mul costs to match worst case

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun Sep 25 06:18:25 PDT 2022


Author: Simon Pilgrim
Date: 2022-09-25T14:18:08+01:00
New Revision: faff990e9bf41cdf7b1bbc97b5b9daf6c95867d1

URL: https://github.com/llvm/llvm-project/commit/faff990e9bf41cdf7b1bbc97b5b9daf6c95867d1
DIFF: https://github.com/llvm/llvm-project/commit/faff990e9bf41cdf7b1bbc97b5b9daf6c95867d1.diff

LOG: [X86] Fix Icelake VPMULLQ zmm pipes and adjust AVX512DQ v8i64 mul costs to match worst case

Icelake VPMULLQ zmm throughput has regressed compared to SkylakeServer, as it now issues on Pipe0 only.

Confirmed with the Intel Software Optimization Manual (SOM), Agner's instruction tables and instlatx64.
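
The 1.50 -> 3.00 cycle change in the llvm-mca expectations below follows directly from the pipe restriction: VPMULLQ zmm is 3 uops, and when those uops can only issue on one pipe the reciprocal throughput is 3/1 = 3.0 cycles rather than 3/2 = 1.5. A minimal standalone sketch of that arithmetic (the helper name and the simple uops-over-pipes model are illustrative assumptions, not LLVM code):

// Illustrative sketch only: approximate reciprocal throughput as the
// resource cycles an instruction consumes divided by the number of pipes
// that can execute it. Names here are hypothetical, not LLVM API.
#include <cstdio>

static double recipThroughput(unsigned ResourceCycles, unsigned NumPipes) {
  return static_cast<double>(ResourceCycles) / NumPipes;
}

int main() {
  // VPMULLQ zmm needs 3 resource cycles.
  std::printf("Pipe0+Pipe5: %.2f\n", recipThroughput(3, 2)); // 1.50 (old model)
  std::printf("Pipe0 only:  %.2f\n", recipThroughput(3, 1)); // 3.00 (Icelake)
  return 0;
}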

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86SchedIceLake.td
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/arith-fix.ll
    llvm/test/Analysis/CostModel/X86/arith-int.ll
    llvm/test/Analysis/CostModel/X86/arith-overflow.ll
    llvm/test/Analysis/CostModel/X86/reduce-mul.ll
    llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
index 93d1eb087301e..354feb79da840 100644
--- a/llvm/lib/Target/X86/X86SchedIceLake.td
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -1995,7 +1995,7 @@ def ICXWriteResGroup174 : SchedWriteRes<[ICXPort01]> {
 }
 def: InstRW<[ICXWriteResGroup174], (instregex "VPMULLQ(Z128|Z256)rr")>;
 
-def ICXWriteResGroup174z : SchedWriteRes<[ICXPort05]> {
+def ICXWriteResGroup174z : SchedWriteRes<[ICXPort0]> {
   let Latency = 15;
   let NumMicroOps = 3;
   let ResourceCycles = [3];
@@ -2163,7 +2163,7 @@ def ICXWriteResGroup211 : SchedWriteRes<[ICXPort23,ICXPort01]> {
 }
 def: InstRW<[ICXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)")>;
 
-def ICXWriteResGroup211_1 : SchedWriteRes<[ICXPort23,ICXPort05]> {
+def ICXWriteResGroup211_1 : SchedWriteRes<[ICXPort23,ICXPort0]> {
   let Latency = 22;
   let NumMicroOps = 4;
   let ResourceCycles = [1,3];

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 1018e9bd3657f..9c35fd92fab4f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -779,7 +779,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
   static const CostKindTblEntry AVX512DQCostTable[] = {
     { ISD::MUL,  MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
     { ISD::MUL,  MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
-    { ISD::MUL,  MVT::v8i64, { 2, 15, 1, 3 } }  // pmullq
+    { ISD::MUL,  MVT::v8i64, { 3, 15, 1, 3 } }  // pmullq
   };
 
   // Look for AVX512DQ lowering tricks for custom cases.
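
For context, each entry in these cost-kind tables carries four values, one per TTI cost kind (reciprocal throughput, latency, code size, size-and-latency); only the v8i64 throughput value changes here, from 2 to 3. A self-contained, hypothetical sketch of how such a table lookup behaves (the enum, the lookup helper and the string VT tags are illustrative, not the real X86TTIImpl code):

// Hypothetical, standalone sketch of a cost-kind table lookup; the actual
// X86TTIImpl code differs, but each entry's four costs are ordered as
// {throughput, latency, code-size, size-and-latency}.
#include <array>
#include <cstddef>
#include <cstdio>
#include <optional>
#include <string_view>

enum class CostKind { Throughput = 0, Latency, CodeSize, SizeAndLatency };

struct Entry {
  std::string_view VT;        // value type tag, e.g. "v8i64"
  std::array<int, 4> Costs;   // one cost per CostKind
};

// Mirrors the AVX512DQ v2i64/v4i64/v8i64 pmullq entries from the diff above.
static constexpr Entry MulTable[] = {
    {"v2i64", {2, 15, 1, 3}},
    {"v4i64", {2, 15, 1, 3}},
    {"v8i64", {3, 15, 1, 3}}, // throughput cost raised from 2 to 3
};

static std::optional<int> lookupMulCost(std::string_view VT, CostKind Kind) {
  for (const Entry &E : MulTable)
    if (E.VT == VT)
      return E.Costs[static_cast<std::size_t>(Kind)];
  return std::nullopt;
}

int main() {
  if (auto C = lookupMulCost("v8i64", CostKind::Throughput))
    std::printf("v8i64 mul throughput cost: %d\n", *C); // prints 3
  return 0;
}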

diff --git a/llvm/test/Analysis/CostModel/X86/arith-fix.ll b/llvm/test/Analysis/CostModel/X86/arith-fix.ll
index 31ca29a997d04..47c451e9f99ed 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fix.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fix.ll
@@ -153,8 +153,8 @@ define i32 @smul(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
@@ -387,8 +387,8 @@ define i32 @umul(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)

diff --git a/llvm/test/Analysis/CostModel/X86/arith-int.ll b/llvm/test/Analysis/CostModel/X86/arith-int.ll
index 0e5af249312dd..fc67e1f8fa37e 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int.ll
@@ -1024,7 +1024,7 @@ define i32 @mul(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = mul <2 x i64> undef, undef
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = mul <4 x i64> undef, undef
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = mul <8 x i64> undef, undef
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I64 = mul <8 x i64> undef, undef
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = mul <4 x i32> undef, undef
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = mul <8 x i32> undef, undef

diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
index 10007f8e4599d..56efbb248fc4f 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -1109,8 +1109,8 @@ define i32 @smul(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -1347,8 +1347,8 @@ define i32 @umul(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)

diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
index 294345d04db56..19ebacec9b96a 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -70,7 +70,7 @@ define i32 @reduce_i64(i32 %arg) {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1  = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)

diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s
index b2b9a933a42cb..1ed632617da08 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx512dq.s
@@ -312,12 +312,12 @@ vpmovq2m          %zmm0, %k0
 # CHECK-NEXT:  2      9     1.00    *                   vfpclassss	$171, (%rax), %k1
 # CHECK-NEXT:  1      4     1.00                        vfpclassss	$171, %xmm16, %k1 {%k2}
 # CHECK-NEXT:  2      9     1.00    *                   vfpclassss	$171, (%rax), %k1 {%k2}
-# CHECK-NEXT:  3      15    1.50                        vpmullq	%zmm16, %zmm17, %zmm19
-# CHECK-NEXT:  4      22    1.50    *                   vpmullq	(%rax), %zmm17, %zmm19
-# CHECK-NEXT:  3      15    1.50                        vpmullq	%zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT:  4      22    1.50    *                   vpmullq	(%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT:  3      15    1.50                        vpmullq	%zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT:  4      22    1.50    *                   vpmullq	(%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT:  3      15    3.00                        vpmullq	%zmm16, %zmm17, %zmm19
+# CHECK-NEXT:  4      22    3.00    *                   vpmullq	(%rax), %zmm17, %zmm19
+# CHECK-NEXT:  3      15    3.00                        vpmullq	%zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT:  4      22    3.00    *                   vpmullq	(%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT:  3      15    3.00                        vpmullq	%zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT:  4      22    3.00    *                   vpmullq	(%rax), %zmm17, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      1     0.50                        vxorpd	%zmm16, %zmm17, %zmm19
 # CHECK-NEXT:  2      8     0.50    *                   vxorpd	(%rax), %zmm17, %zmm19
 # CHECK-NEXT:  2      8     0.50    *                   vxorpd	(%rax){1to8}, %zmm17, %zmm19
@@ -357,7 +357,7 @@ vpmovq2m          %zmm0, %k0
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -      -     66.00  10.00  46.50  46.50   -     96.00   -      -      -      -
+# CHECK-NEXT:  -      -     75.00  10.00  46.50  46.50   -     87.00   -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -486,12 +486,12 @@ vpmovq2m          %zmm0, %k0
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -      -      -     vfpclassss	$171, (%rax), %k1
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vfpclassss	$171, %xmm16, %k1 {%k2}
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -      -      -     vfpclassss	$171, (%rax), %k1 {%k2}
-# CHECK-NEXT:  -      -     1.50    -      -      -      -     1.50    -      -      -      -     vpmullq	%zmm16, %zmm17, %zmm19
-# CHECK-NEXT:  -      -     1.50    -     0.50   0.50    -     1.50    -      -      -      -     vpmullq	(%rax), %zmm17, %zmm19
-# CHECK-NEXT:  -      -     1.50    -      -      -      -     1.50    -      -      -      -     vpmullq	%zmm16, %zmm17, %zmm19 {%k1}
-# CHECK-NEXT:  -      -     1.50    -     0.50   0.50    -     1.50    -      -      -      -     vpmullq	(%rax), %zmm17, %zmm19 {%k1}
-# CHECK-NEXT:  -      -     1.50    -      -      -      -     1.50    -      -      -      -     vpmullq	%zmm16, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -     1.50    -     0.50   0.50    -     1.50    -      -      -      -     vpmullq	(%rax), %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -     3.00    -      -      -      -      -      -      -      -      -     vpmullq	%zmm16, %zmm17, %zmm19
+# CHECK-NEXT:  -      -     3.00    -     0.50   0.50    -      -      -      -      -      -     vpmullq	(%rax), %zmm17, %zmm19
+# CHECK-NEXT:  -      -     3.00    -      -      -      -      -      -      -      -      -     vpmullq	%zmm16, %zmm17, %zmm19 {%k1}
+# CHECK-NEXT:  -      -     3.00    -     0.50   0.50    -      -      -      -      -      -     vpmullq	(%rax), %zmm17, %zmm19 {%k1}
+# CHECK-NEXT:  -      -     3.00    -      -      -      -      -      -      -      -      -     vpmullq	%zmm16, %zmm17, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -     3.00    -     0.50   0.50    -      -      -      -      -      -     vpmullq	(%rax), %zmm17, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -     0.50    -      -      -      -     0.50    -      -      -      -     vxorpd	%zmm16, %zmm17, %zmm19
 # CHECK-NEXT:  -      -     0.50    -     0.50   0.50    -     0.50    -      -      -      -     vxorpd	(%rax), %zmm17, %zmm19
 # CHECK-NEXT:  -      -     0.50    -     0.50   0.50    -     0.50    -      -      -      -     vxorpd	(%rax){1to8}, %zmm17, %zmm19
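
The per-iteration pressure totals above move in lockstep with the per-instruction rows: the six VPMULLQ zmm forms each shift 1.50 cycles of pressure between ports, so (assuming column [2] is ICXPort0 and [7] is ICXPort5) Port0 rises 66.00 -> 75.00 while Port5 falls 96.00 -> 87.00. A tiny sketch of that check (the column mapping and instruction count are assumptions read off this test, not values queried from LLVM):

// Quick arithmetic check of the resource-pressure totals in the diff above.
#include <cstdio>

int main() {
  const double Moved = 1.5;   // cycles per vpmullq previously issued on Port5
  const int NumVPMULLQ = 6;   // zmm register and memory forms in the test
  std::printf("Port0: %.2f -> %.2f\n", 66.0, 66.0 + Moved * NumVPMULLQ); // 75.00
  std::printf("Port5: %.2f -> %.2f\n", 96.0, 96.0 - Moved * NumVPMULLQ); // 87.00
  return 0;
}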


        

