[llvm] 444685d - [CostModel][X86] Adjust mul v4i32/v8i32 throughput cost

Sat Sep 3 10:45:27 PDT 2022

Author: Simon Pilgrim
Date: 2022-09-03T18:45:08+01:00
New Revision: 444685de06e0e8257aa913b198e2a185ff8ba7f5

URL: https://github.com/llvm/llvm-project/commit/444685de06e0e8257aa913b198e2a185ff8ba7f5
DIFF: https://github.com/llvm/llvm-project/commit/444685de06e0e8257aa913b198e2a185ff8ba7f5.diff

LOG: [CostModel][X86] Adjust mul v4i32/v8i32 throughput cost

Based off the numbers from AMD SoG + Agner - vXi32 are both half-rate, and znver1 double-pumps the v8i32 op

We should have caught this earlier as many Intel models have half-rate pmulld already :-(

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ScheduleZnver1.td
    llvm/lib/Target/X86/X86ScheduleZnver2.td
    llvm/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/X86/arith-fix.ll
    llvm/test/Analysis/CostModel/X86/arith-int.ll
    llvm/test/Analysis/CostModel/X86/arith-overflow.ll
    llvm/test/Analysis/CostModel/X86/mul.ll
    llvm/test/Analysis/CostModel/X86/mul32.ll
    llvm/test/Analysis/CostModel/X86/reduce-mul.ll
    llvm/test/Analysis/CostModel/X86/rem.ll
    llvm/test/tools/llvm-mca/X86/Znver1/resources-avx1.s
    llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s
    llvm/test/tools/llvm-mca/X86/Znver1/resources-sse41.s
    llvm/test/tools/llvm-mca/X86/Znver2/resources-avx1.s
    llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s
    llvm/test/tools/llvm-mca/X86/Znver2/resources-sse41.s

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 054fff0ccdf1..463ec90f68be 100644

--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -424,8 +424,8 @@ defm : ZnWriteResFpuPair<WriteVecIMul,    [ZnFPU0],  4>;
 defm : ZnWriteResFpuPair<WriteVecIMulX,   [ZnFPU0],  4>;
 defm : ZnWriteResFpuPair<WriteVecIMulY,   [ZnFPU0],  4>;
 defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
-defm : ZnWriteResFpuPair<WritePMULLD,     [ZnFPU0],  4, [1], 1, 7, 1>; // FIXME
-defm : ZnWriteResFpuPair<WritePMULLDY,    [ZnFPU0],  5, [2], 1, 7, 1>; // FIXME
+defm : ZnWriteResFpuPair<WritePMULLD,     [ZnFPU0],  4, [2]>;
+defm : ZnWriteResFpuPair<WritePMULLDY,    [ZnFPU0],  4, [4], 2>;
 defm : X86WriteResPairUnsupported<WritePMULLDZ>;
 defm : ZnWriteResFpuPair<WriteShuffle,    [ZnFPU],   1>;
 defm : ZnWriteResFpuPair<WriteShuffleX,   [ZnFPU],   1>;

diff  --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td
index ad571ba65c39..ebb12a70e2d1 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver2.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -423,8 +423,8 @@ defm : Zn2WriteResFpuPair<WriteVecIMul,    [Zn2FPU0],  4>;
 defm : Zn2WriteResFpuPair<WriteVecIMulX,   [Zn2FPU0],  4>;
 defm : Zn2WriteResFpuPair<WriteVecIMulY,   [Zn2FPU0],  4>;
 defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
-defm : Zn2WriteResFpuPair<WritePMULLD,     [Zn2FPU0],  4, [1], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WritePMULLDY,    [Zn2FPU0],  4, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WritePMULLD,     [Zn2FPU0],  4, [2]>;
+defm : Zn2WriteResFpuPair<WritePMULLDY,    [Zn2FPU0],  4, [2]>;
 defm : X86WriteResPairUnsupported<WritePMULLDZ>;
 defm : Zn2WriteResFpuPair<WriteShuffle,    [Zn2FPU],   1>;
 defm : Zn2WriteResFpuPair<WriteShuffleX,   [Zn2FPU],   1>;

diff  --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b505e271e7ee..a3edca048c23 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -907,7 +907,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     { ISD::ADD,  MVT::v4i64,   {  1,  1, 1, 2 } }, // paddq
 
     { ISD::MUL,  MVT::v16i16,  {  1 } }, // pmullw
-    { ISD::MUL,  MVT::v8i32,   {  2 } }, // pmulld (Haswell from agner.org)
+    { ISD::MUL,  MVT::v8i32,   {  4 } }, // pmulld
     { ISD::MUL,  MVT::v4i64,   {  6 } }, // 3*pmuludq/3*shift/2*add
 
     { ISD::FNEG, MVT::v4f64,   {  1,  1, 1, 2 } }, // vxorpd

diff  --git a/llvm/test/Analysis/CostModel/X86/arith-fix.ll b/llvm/test/Analysis/CostModel/X86/arith-fix.ll
index 6afe180f60b1..6610b8176375 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fix.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fix.ll
@@ -99,9 +99,9 @@ define i32 @smul(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.smul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.smul.fix.i8(i8 undef, i8 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)
@@ -333,9 +333,9 @@ define i32 @umul(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.umul.fix.v32i16(<32 x i16> undef, <32 x i16> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I8 = call i8 @llvm.umul.fix.i8(i8 undef, i8 undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> undef, <16 x i8> undef, i32 3)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> undef, <32 x i8> undef, i32 3)

diff  --git a/llvm/test/Analysis/CostModel/X86/arith-int.ll b/llvm/test/Analysis/CostModel/X86/arith-int.ll
index 547ee8942a83..58196b09ac1d 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int.ll
@@ -999,8 +999,8 @@ define i32 @mul(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = mul <8 x i64> undef, undef
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = mul <4 x i32> undef, undef
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = mul <8 x i32> undef, undef
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = mul <16 x i32> undef, undef
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = mul <8 x i32> undef, undef
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = mul <16 x i32> undef, undef
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = mul i16 undef, undef
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = mul <8 x i16> undef, undef
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = mul <16 x i16> undef, undef

diff  --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
index a6823ddc980b..eeeac179d10a 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -1055,9 +1055,9 @@ define i32 @smul(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
@@ -1293,9 +1293,9 @@ define i32 @umul(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)

diff  --git a/llvm/test/Analysis/CostModel/X86/mul.ll b/llvm/test/Analysis/CostModel/X86/mul.ll
index 939026892a7b..63329e514447 100644
--- a/llvm/test/Analysis/CostModel/X86/mul.ll
+++ b/llvm/test/Analysis/CostModel/X86/mul.ll
@@ -447,8 +447,8 @@ define i32 @mul_constnegpow2() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8i64 = mul <8 x i64> undef, <i64 -2, i64 -4, i64 -8, i64 -16, i64 -32, i64 -64, i64 -128, i64 -256>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, -16
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = mul <4 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256, i32 -2, i32 -4, i32 -8, i32 -16, i32 -32, i32 -64, i32 -128, i32 -256>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = mul i16 undef, -16
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = mul <8 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256, i16 -2, i16 -4, i16 -8, i16 -16, i16 -32, i16 -64, i16 -128, i16 -256>
@@ -623,8 +623,8 @@ define i32 @mul_uniformconstnegpow2() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8i64 = mul <8 x i64> undef, <i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16, i64 -16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, -16
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = mul <4 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = mul <8 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = mul <16 x i32> undef, <i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16, i32 -16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = mul i16 undef, -16
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = mul <8 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = mul <16 x i16> undef, <i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16>

diff  --git a/llvm/test/Analysis/CostModel/X86/mul32.ll b/llvm/test/Analysis/CostModel/X86/mul32.ll
index d992d2167949..805c683373e6 100644
--- a/llvm/test/Analysis/CostModel/X86/mul32.ll
+++ b/llvm/test/Analysis/CostModel/X86/mul32.ll
@@ -105,10 +105,10 @@ define void @mul_sext_vXi8(<4 x i8> %a4, <4 x i8> %b4, <8 x i8> %a8, <8 x i8> %b
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i32>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %xb64 = sext <64 x i8> %b64 to <64 x i32>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res4 = mul <4 x i32> %xa4, %xb4
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res8 = mul <8 x i32> %xa8, %xb8
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res16 = mul <16 x i32> %xa16, %xb16
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res32 = mul <32 x i32> %xa32, %xb32
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res64 = mul <64 x i32> %xa64, %xb64
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res8 = mul <8 x i32> %xa8, %xb8
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res16 = mul <16 x i32> %xa16, %xb16
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res32 = mul <32 x i32> %xa32, %xb32
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res64 = mul <64 x i32> %xa64, %xb64
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'mul_sext_vXi8'
@@ -761,10 +761,10 @@ define void @mul_zext_vXi16(<4 x i16> %a4, <4 x i16> %b4, <8 x i16> %a8, <8 x i1
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %xa64 = zext <64 x i16> %a64 to <64 x i32>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %xb64 = zext <64 x i16> %b64 to <64 x i32>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res4 = mul <4 x i32> %xa4, %xb4
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res8 = mul <8 x i32> %xa8, %xb8
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res16 = mul <16 x i32> %xa16, %xb16
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res32 = mul <32 x i32> %xa32, %xb32
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res64 = mul <64 x i32> %xa64, %xb64
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res8 = mul <8 x i32> %xa8, %xb8
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res16 = mul <16 x i32> %xa16, %xb16
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res32 = mul <32 x i32> %xa32, %xb32
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res64 = mul <64 x i32> %xa64, %xb64
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'mul_zext_vXi16'
@@ -924,10 +924,10 @@ define void @mul_sext_zext_vXi16(<4 x i16> %a4, <4 x i16> %b4, <8 x i16> %a8, <8
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i32>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %xb64 = zext <64 x i16> %b64 to <64 x i32>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res4 = mul <4 x i32> %xa4, %xb4
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res8 = mul <8 x i32> %xa8, %xb8
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res16 = mul <16 x i32> %xa16, %xb16
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res32 = mul <32 x i32> %xa32, %xb32
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res64 = mul <64 x i32> %xa64, %xb64
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res8 = mul <8 x i32> %xa8, %xb8
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res16 = mul <16 x i32> %xa16, %xb16
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res32 = mul <32 x i32> %xa32, %xb32
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res64 = mul <64 x i32> %xa64, %xb64
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'mul_sext_zext_vXi16'

diff  --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
index 53ed838339eb..4e6d020c5f73 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -118,8 +118,8 @@ define i32 @reduce_i32(i32 %arg) {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> undef)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'reduce_i32'

diff  --git a/llvm/test/Analysis/CostModel/X86/rem.ll b/llvm/test/Analysis/CostModel/X86/rem.ll
index 31839c7719f4..605bdf41e54a 100644
--- a/llvm/test/Analysis/CostModel/X86/rem.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem.ll
@@ -885,8 +885,8 @@ define i32 @srem_constpow2() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = srem <16 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = srem i16 undef, 16
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V8i16 = srem <8 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16i16 = srem <16 x i16> undef, <i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256>
@@ -1180,8 +1180,8 @@ define i32 @srem_uniformconstpow2() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i32 = srem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i32 = srem <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %I16 = srem i16 undef, 16
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = srem <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = srem <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>

diff  --git a/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx1.s
index 2a49fb5fd6d3..068341e79fe7 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx1.s
@@ -1538,8 +1538,8 @@ vzeroupper
 # CHECK-NEXT:  1      11    1.00    *                   vpmulhuw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     1.00                        vpmulhw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmulhw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      4     1.00                        vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     2.00                        vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      11    2.00    *                   vpmulld	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     1.00                        vpmullw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmullw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     1.00                        vpmuludq	%xmm0, %xmm1, %xmm2
@@ -1738,7 +1738,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT: 175.00 175.00  -      -      -      -      -     151.58 191.58 218.75 513.08  -
+# CHECK-NEXT: 175.00 175.00  -      -      -      -      -     153.58 191.58 218.75 513.08  -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -2250,8 +2250,8 @@ vzeroupper
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vpmulhuw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vpmulhw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vpmulhw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vpmulld	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vpmullw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vpmullw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vpmuludq	%xmm0, %xmm1, %xmm2

diff  --git a/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s
index f5a4d491dae1..669e6f7fdb42 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver1/resources-avx2.s
@@ -657,8 +657,8 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmulhuw	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmulhw	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmulhw	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      5     2.00                        vpmulld	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      12    2.00    *                   vpmulld	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      4     4.00                        vpmulld	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      11    4.00    *                   vpmulld	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmullw	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmullw	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmuludq	%ymm0, %ymm1, %ymm2
@@ -778,7 +778,7 @@ vpxor           (%rax), %ymm1, %ymm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT: 67.00  67.00   -      -      -      -      -     96.67  183.17 164.50 64.67   -
+# CHECK-NEXT: 67.00  67.00   -      -      -      -      -     100.67 183.17 164.50 64.67   -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -979,8 +979,8 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vpmulhuw	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vpmulhw	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vpmulhw	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -      -      -     vpmulld	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vpmulld	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     4.00    -      -      -      -     vpmulld	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     4.00    -      -      -      -     vpmulld	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vpmullw	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vpmullw	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vpmuludq	%ymm0, %ymm1, %ymm2

diff  --git a/llvm/test/tools/llvm-mca/X86/Znver1/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Znver1/resources-sse41.s
index fb39f9471654..e3a05b11a5e9 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver1/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver1/resources-sse41.s
@@ -239,8 +239,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  1      8     0.50    *                   pmovzxwq	(%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        pmuldq	%xmm0, %xmm2
 # CHECK-NEXT:  1      11    1.00    *                   pmuldq	(%rax), %xmm2
-# CHECK-NEXT:  1      4     1.00                        pmulld	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmulld	(%rax), %xmm2
+# CHECK-NEXT:  1      4     2.00                        pmulld	%xmm0, %xmm2
+# CHECK-NEXT:  1      11    2.00    *                   pmulld	(%rax), %xmm2
 # CHECK-NEXT:  1      2     1.00                        ptest	%xmm0, %xmm1
 # CHECK-NEXT:  2      9     1.00    *                   ptest	(%rax), %xmm1
 # CHECK-NEXT:  1      4     1.00                        roundpd	$1, %xmm0, %xmm2
@@ -268,7 +268,7 @@ roundss     $1, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT: 25.00  25.00   -      -      -      -      -     25.17  26.67  44.00  21.17   -
+# CHECK-NEXT: 25.00  25.00   -      -      -      -      -     27.17  26.67  44.00  21.17   -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -356,8 +356,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     0.25   0.25   0.25   0.25    -     pmovzxwq	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     pmuldq	%xmm0, %xmm2
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     pmuldq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     pmulld	%xmm0, %xmm2
-# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     pmulld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -      -      -     pmulld	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     pmulld	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -     ptest	%xmm0, %xmm1
 # CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.00   1.00    -      -     ptest	(%rax), %xmm1
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -     roundpd	$1, %xmm0, %xmm2

diff  --git a/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx1.s
index c5a006ef2ee9..faea6244f8c1 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx1.s
@@ -1538,8 +1538,8 @@ vzeroupper
 # CHECK-NEXT:  1      11    1.00    *                   vpmulhuw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     1.00                        vpmulhw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmulhw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      4     1.00                        vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     2.00                        vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      11    2.00    *                   vpmulld	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     1.00                        vpmullw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmullw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     1.00                        vpmuludq	%xmm0, %xmm1, %xmm2
@@ -1739,7 +1739,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 117.00 117.00 117.00 0.25   0.25   0.25   0.25    -     132.92 169.92 204.75 465.42  -
+# CHECK-NEXT: 117.00 117.00 117.00 0.25   0.25   0.25   0.25    -     134.92 169.92 204.75 465.42  -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -2251,8 +2251,8 @@ vzeroupper
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     vpmulhuw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vpmulhw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     vpmulhw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00    -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     2.00    -      -      -      -     vpmulld	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vpmullw	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     vpmullw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vpmuludq	%xmm0, %xmm1, %xmm2

diff  --git a/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s
index 142d3e89bff6..0ac2239faf0b 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver2/resources-avx2.s
@@ -657,8 +657,8 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmulhuw	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmulhw	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmulhw	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      4     1.00                        vpmulld	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      11    1.00    *                   vpmulld	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      4     2.00                        vpmulld	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      11    2.00    *                   vpmulld	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmullw	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT:  1      11    1.00    *                   vpmullw	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      4     1.00                        vpmuludq	%ymm0, %ymm1, %ymm2
@@ -779,7 +779,7 @@ vpxor           (%rax), %ymm1, %ymm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 44.67  44.67  44.67   -      -      -      -      -     68.17  103.17 81.00  40.67   -
+# CHECK-NEXT: 44.67  44.67  44.67   -      -      -      -      -     70.17  103.17 81.00  40.67   -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -980,8 +980,8 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     vpmulhuw	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vpmulhw	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     vpmulhw	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vpmulld	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     vpmulld	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00    -      -      -      -     vpmulld	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     2.00    -      -      -      -     vpmulld	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vpmullw	%ymm0, %ymm1, %ymm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     vpmullw	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     vpmuludq	%ymm0, %ymm1, %ymm2

diff  --git a/llvm/test/tools/llvm-mca/X86/Znver2/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/Znver2/resources-sse41.s
index d15850a3c1f5..401c50bc4b7e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver2/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver2/resources-sse41.s
@@ -239,8 +239,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  1      8     0.33    *                   pmovzxwq	(%rax), %xmm2
 # CHECK-NEXT:  1      4     1.00                        pmuldq	%xmm0, %xmm2
 # CHECK-NEXT:  1      11    1.00    *                   pmuldq	(%rax), %xmm2
-# CHECK-NEXT:  1      4     1.00                        pmulld	%xmm0, %xmm2
-# CHECK-NEXT:  2      11    1.00    *                   pmulld	(%rax), %xmm2
+# CHECK-NEXT:  1      4     2.00                        pmulld	%xmm0, %xmm2
+# CHECK-NEXT:  1      11    2.00    *                   pmulld	(%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        ptest	%xmm0, %xmm1
 # CHECK-NEXT:  2      10    1.00    *                   ptest	(%rax), %xmm1
 # CHECK-NEXT:  1      3     1.00                        roundpd	$1, %xmm0, %xmm2
@@ -269,7 +269,7 @@ roundss     $1, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]
-# CHECK-NEXT: 17.00  17.00  17.00   -      -      -      -      -     25.17  26.67  44.00  21.17   -
+# CHECK-NEXT: 17.00  17.00  17.00   -      -      -      -      -     27.17  26.67  44.00  21.17   -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   Instructions:
@@ -357,8 +357,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     0.25   0.25   0.25   0.25    -     pmovzxwq	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     pmuldq	%xmm0, %xmm2
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     pmuldq	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -      -      -     pmulld	%xmm0, %xmm2
-# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     1.00    -      -      -      -     pmulld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00    -      -      -      -     pmulld	%xmm0, %xmm2
+# CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -     2.00    -      -      -      -     pmulld	(%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -     ptest	%xmm0, %xmm1
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -     1.00   1.00    -      -     ptest	(%rax), %xmm1
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -     roundpd	$1, %xmm0, %xmm2