[llvm] 595a743 - [CostModel][X86] Tweak SSE2 v2i64 multiply costs based off D46276 script
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 14 03:06:44 PDT 2023
Author: Simon Pilgrim
Date: 2023-06-14T11:06:15+01:00
New Revision: 595a74391daaff8daffa2f6e19274792d0074565
URL: https://github.com/llvm/llvm-project/commit/595a74391daaff8daffa2f6e19274792d0074565
DIFF: https://github.com/llvm/llvm-project/commit/595a74391daaff8daffa2f6e19274792d0074565.diff
LOG: [CostModel][X86] Tweak SSE2 v2i64 multiply costs based off D46276 script
It looks like the old SSE2 v2i64 multiply costs were trying to account for SLM (Silvermont) costs, which are actually handled separately.
Fixes #62969
Added:
Modified:
llvm/lib/Target/X86/X86TargetTransformInfo.cpp
llvm/test/Analysis/CostModel/X86/arith-fix.ll
llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
llvm/test/Analysis/CostModel/X86/arith-int.ll
llvm/test/Analysis/CostModel/X86/arith-overflow.ll
llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
llvm/test/Analysis/CostModel/X86/mul64.ll
llvm/test/Analysis/CostModel/X86/reduce-mul.ll
llvm/test/Analysis/CostModel/X86/rem-codesize.ll
llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
llvm/test/Analysis/CostModel/X86/rem.ll
llvm/test/Transforms/SLPVectorizer/X86/mul64.ll
llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index ecba9b20b8d62..7a5baa82fda22 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1335,7 +1335,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
{ ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
{ ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
- { ISD::MUL, MVT::v2i64, { 8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
+ { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
{ X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fix.ll b/llvm/test/Analysis/CostModel/X86/arith-fix.ll
index e976fb63e190b..3595f30826f36 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-fix.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fix.ll
@@ -38,9 +38,9 @@ define i32 @smul(i32 %arg) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.smul.fix.i32(i32 undef, i32 undef, i32 3)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4I32 = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V8I32 = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V16I32 = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.smul.fix.i16(i16 undef, i16 undef, i32 3)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I16 = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16I16 = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
@@ -272,9 +272,9 @@ define i32 @umul(i32 %arg) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I64 = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> undef, <4 x i64> undef, i32 3)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I64 = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> undef, <8 x i64> undef, i32 3)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.umul.fix.i32(i32 undef, i32 undef, i32 3)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> undef, <4 x i32> undef, i32 3)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8I32 = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> undef, <8 x i32> undef, i32 3)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16I32 = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> undef, <16 x i32> undef, i32 3)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call i16 @llvm.umul.fix.i16(i16 undef, i16 undef, i32 3)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I16 = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> undef, <8 x i16> undef, i32 3)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16I16 = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> undef, <16 x i16> undef, i32 3)
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
index f469967563b4b..64dce905e8aa1 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
@@ -776,9 +776,9 @@ define i32 @and(i32 %arg) {
define i32 @mul(i32 %arg) {
; SSE2-LABEL: 'mul'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = mul i64 undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = mul <2 x i64> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = mul <4 x i64> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = mul <8 x i64> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = mul <2 x i64> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = mul <4 x i64> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = mul <8 x i64> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = mul <4 x i32> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = mul <8 x i32> undef, undef
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
index 580145e2f9b28..c5ae4d8c901a0 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
@@ -620,9 +620,9 @@ define i32 @and(i32 %arg) {
define i32 @mul(i32 %arg) {
; SSE2-LABEL: 'mul'
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = mul <2 x i64> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = mul <4 x i64> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = mul <8 x i64> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = mul <2 x i64> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = mul <4 x i64> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = mul <8 x i64> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = mul <4 x i32> undef, undef
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8I32 = mul <8 x i32> undef, undef
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int.ll b/llvm/test/Analysis/CostModel/X86/arith-int.ll
index cd70079965d33..45bb058873ddc 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int.ll
@@ -890,9 +890,9 @@ define i32 @and(i32 %arg) {
define i32 @mul(i32 %arg) {
; SSSE3-LABEL: 'mul'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = mul i64 undef, undef
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = mul <2 x i64> undef, undef
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = mul <4 x i64> undef, undef
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = mul <8 x i64> undef, undef
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = mul <2 x i64> undef, undef
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = mul <4 x i64> undef, undef
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = mul <8 x i64> undef, undef
; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = mul i32 undef, undef
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = mul <4 x i32> undef, undef
; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = mul <8 x i32> undef, undef
diff --git a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
index 56efbb248fc4f..ba745262d1890 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -994,9 +994,9 @@ define i32 @smul(i32 %arg) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
@@ -1232,9 +1232,9 @@ define i32 @umul(i32 %arg) {
; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index c87ac685c365a..f5936a78f932f 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -51,7 +51,7 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
; THRU-LABEL: 'umul'
; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; THRU-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; THRU-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; LATE-LABEL: 'umul'
@@ -61,12 +61,12 @@ define void @umul(i32 %a, i32 %b, <16 x i32> %va, <16 x i32> %vb) {
;
; SIZE-LABEL: 'umul'
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SIZE_LATE-LABEL: 'umul'
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
-; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
+; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> %va, <16 x i32> %vb)
; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%s = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
diff --git a/llvm/test/Analysis/CostModel/X86/mul64.ll b/llvm/test/Analysis/CostModel/X86/mul64.ll
index 662604813ab66..718972093e8e5 100644
--- a/llvm/test/Analysis/CostModel/X86/mul64.ll
+++ b/llvm/test/Analysis/CostModel/X86/mul64.ll
@@ -31,12 +31,12 @@ define void @mul_sext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i8> %b
; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb32 = sext <32 x i8> %b32 to <32 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xb64 = sext <64 x i8> %b64 to <64 x i64>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSSE3-LABEL: 'mul_sext_vXi8'
@@ -52,12 +52,12 @@ define void @mul_sext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i8> %b
; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb32 = sext <32 x i8> %b32 to <32 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xb64 = sext <64 x i8> %b64 to <64 x i64>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'mul_sext_vXi8'
@@ -453,12 +453,12 @@ define void @mul_sext_zext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i
; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb32 = zext <32 x i8> %b32 to <32 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xb64 = zext <64 x i8> %b64 to <64 x i64>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSSE3-LABEL: 'mul_sext_zext_vXi8'
@@ -474,12 +474,12 @@ define void @mul_sext_zext_vXi8(<2 x i8> %a2, <2 x i8> %b2, <4 x i8> %a4, <4 x i
; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb32 = zext <32 x i8> %b32 to <32 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xa64 = sext <64 x i8> %a64 to <64 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %xb64 = zext <64 x i8> %b64 to <64 x i64>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'mul_sext_zext_vXi8'
@@ -689,12 +689,12 @@ define void @mul_sext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4 x i1
; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %xb32 = sext <32 x i16> %b32 to <32 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xb64 = sext <64 x i16> %b64 to <64 x i64>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSSE3-LABEL: 'mul_sext_vXi16'
@@ -710,12 +710,12 @@ define void @mul_sext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4 x i1
; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %xb32 = sext <32 x i16> %b32 to <32 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xb64 = sext <64 x i16> %b64 to <64 x i64>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'mul_sext_vXi16'
@@ -1111,12 +1111,12 @@ define void @mul_sext_zext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb32 = zext <32 x i16> %b32 to <32 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb64 = zext <64 x i16> %b64 to <64 x i64>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSSE3-LABEL: 'mul_sext_zext_vXi16'
@@ -1132,12 +1132,12 @@ define void @mul_sext_zext_vXi16(<2 x i16> %a2, <2 x i16> %b2, <4 x i16> %a4, <4
; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb32 = zext <32 x i16> %b32 to <32 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %xa64 = sext <64 x i16> %a64 to <64 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb64 = zext <64 x i16> %b64 to <64 x i64>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'mul_sext_zext_vXi16'
@@ -1347,12 +1347,12 @@ define void @mul_sext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4 x i3
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb32 = sext <32 x i32> %b32 to <32 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb64 = sext <64 x i32> %b64 to <64 x i64>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSSE3-LABEL: 'mul_sext_vXi32'
@@ -1368,12 +1368,12 @@ define void @mul_sext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4 x i3
; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb32 = sext <32 x i32> %b32 to <32 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xb64 = sext <64 x i32> %b64 to <64 x i64>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'mul_sext_vXi32'
@@ -1769,12 +1769,12 @@ define void @mul_sext_zext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %xb32 = zext <32 x i32> %b32 to <32 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64>
; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb64 = zext <64 x i32> %b64 to <64 x i64>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSE2-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSSE3-LABEL: 'mul_sext_zext_vXi32'
@@ -1790,12 +1790,12 @@ define void @mul_sext_zext_vXi32(<2 x i32> %a2, <2 x i32> %b2, <4 x i32> %a4, <4
; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %xb32 = zext <32 x i32> %b32 to <32 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %xa64 = sext <64 x i32> %a64 to <64 x i64>
; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %xb64 = zext <64 x i32> %b64 to <64 x i64>
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res2 = mul <2 x i64> %xa2, %xb2
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %res4 = mul <4 x i64> %xa4, %xb4
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res8 = mul <8 x i64> %xa8, %xb8
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %res16 = mul <16 x i64> %xa16, %xb16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %res32 = mul <32 x i64> %xa32, %xb32
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %res64 = mul <64 x i64> %xa64, %xb64
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SSE42-LABEL: 'mul_sext_zext_vXi32'
diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
index 19ebacec9b96a..93d32466136d7 100644
--- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -11,18 +11,18 @@
define i32 @reduce_i64(i32 %arg) {
; SSE2-LABEL: 'reduce_i64'
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i64'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i64'
diff --git a/llvm/test/Analysis/CostModel/X86/rem-codesize.ll b/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
index 108db7293baa6..9ae78fb3ec606 100644
--- a/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
@@ -272,9 +272,9 @@ define i32 @urem_uniformconst() {
define i32 @srem_constpow2() {
; SSE2-LABEL: 'srem_constpow2'
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 156 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
@@ -510,9 +510,9 @@ define i32 @urem_constpow2() {
define i32 @srem_uniformconstpow2() {
; SSE2-LABEL: 'srem_uniformconstpow2'
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
diff --git a/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
index ea52ce2a832e1..f737b4369d853 100644
--- a/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
@@ -272,9 +272,9 @@ define i32 @urem_uniformconst() {
define i32 @srem_constpow2() {
; SSE2-LABEL: 'srem_constpow2'
; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 204 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
; SSE2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
@@ -510,9 +510,9 @@ define i32 @urem_constpow2() {
define i32 @srem_uniformconstpow2() {
; SSE2-LABEL: 'srem_uniformconstpow2'
; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
diff --git a/llvm/test/Analysis/CostModel/X86/rem.ll b/llvm/test/Analysis/CostModel/X86/rem.ll
index a5f2b85ca446f..8bbf775f8efd0 100644
--- a/llvm/test/Analysis/CostModel/X86/rem.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem.ll
@@ -709,9 +709,9 @@ define i32 @urem_uniformconst() {
define i32 @srem_constpow2() {
; SSE2-LABEL: 'srem_constpow2'
; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2i64 = srem <2 x i64> undef, <i64 8, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4i64 = srem <4 x i64> undef, <i64 2, i64 4, i64 8, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8i64 = srem <8 x i64> undef, <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4i32 = srem <4 x i32> undef, <i32 2, i32 4, i32 8, i32 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i32 = srem <8 x i32> undef, <i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256>
@@ -966,9 +966,9 @@ define i32 @urem_constpow2() {
define i32 @srem_uniformconstpow2() {
; SSE2-LABEL: 'srem_uniformconstpow2'
; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2i64 = srem <2 x i64> undef, <i64 16, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4i64 = srem <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8i64 = srem <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4i32 = srem <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll b/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll
index d764d5c05a2f6..4b5b5da21cd21 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/mul64.ll
@@ -1,27 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64 | FileCheck %s --check-prefix=SCALAR
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64 | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v2 | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v3 | FileCheck %s
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S -mcpu=x86-64-v4 | FileCheck %s
define void @PR62969(ptr dereferenceable(16) %out, ptr dereferenceable(16) %in) {
-; SCALAR-LABEL: @PR62969(
-; SCALAR-NEXT: [[IN0:%.*]] = getelementptr inbounds [2 x i64], ptr [[IN:%.*]], i64 0, i64 0
-; SCALAR-NEXT: [[IN1:%.*]] = getelementptr inbounds [2 x i64], ptr [[IN]], i64 0, i64 1
-; SCALAR-NEXT: [[X:%.*]] = load i64, ptr [[IN0]], align 8
-; SCALAR-NEXT: [[Y:%.*]] = load i64, ptr [[IN1]], align 8
-; SCALAR-NEXT: [[XL:%.*]] = and i64 [[X]], 4294967295
-; SCALAR-NEXT: [[YL:%.*]] = and i64 [[Y]], 4294967295
-; SCALAR-NEXT: [[XH:%.*]] = lshr i64 [[X]], 32
-; SCALAR-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32
-; SCALAR-NEXT: [[M0:%.*]] = mul i64 [[XL]], [[XH]]
-; SCALAR-NEXT: [[M1:%.*]] = mul i64 [[YL]], [[YH]]
-; SCALAR-NEXT: [[OUT0:%.*]] = getelementptr inbounds [2 x i64], ptr [[OUT:%.*]], i64 0, i64 0
-; SCALAR-NEXT: [[OUT1:%.*]] = getelementptr inbounds [2 x i64], ptr [[OUT]], i64 0, i64 1
-; SCALAR-NEXT: store i64 [[M0]], ptr [[OUT0]], align 8
-; SCALAR-NEXT: store i64 [[M1]], ptr [[OUT1]], align 8
-; SCALAR-NEXT: ret void
-;
; CHECK-LABEL: @PR62969(
; CHECK-NEXT: [[IN0:%.*]] = getelementptr inbounds [2 x i64], ptr [[IN:%.*]], i64 0, i64 0
; CHECK-NEXT: [[OUT0:%.*]] = getelementptr inbounds [2 x i64], ptr [[OUT:%.*]], i64 0, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
index 2ec4c04c2c8dc..cf3d40df15dad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll
@@ -4,49 +4,25 @@
define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[P0:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[A2:%.*]] = add i64 [[P2:%.*]], [[P2]]
-; CHECK-NEXT: [[A3:%.*]] = add i64 [[P3:%.*]], [[P3]]
-; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[M2:%.*]] = mul i64 [[P2]], [[P2]]
-; CHECK-NEXT: [[M3:%.*]] = mul i64 [[P3]], [[P3]]
-; CHECK-NEXT: [[TMP4:%.*]] = sdiv <2 x i64> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[D2:%.*]] = sdiv i64 [[P2]], [[P2]]
-; CHECK-NEXT: [[D3:%.*]] = sdiv i64 [[P3]], [[P3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT: [[S0:%.*]] = sub i64 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT: [[S1:%.*]] = sub i64 [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[S2:%.*]] = sub i64 [[M2]], [[D2]]
-; CHECK-NEXT: [[S3:%.*]] = sub i64 [[M3]], [[D3]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; CHECK-NEXT: [[SHL1:%.*]] = shl i64 [[TMP9]], [[S0]]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; CHECK-NEXT: [[SHL2:%.*]] = shl i64 [[TMP10]], [[S1]]
-; CHECK-NEXT: [[SHL3:%.*]] = shl i64 [[A2]], [[S2]]
-; CHECK-NEXT: [[SHL4:%.*]] = shl i64 [[A3]], [[S3]]
-; CHECK-NEXT: [[O0:%.*]] = or i64 [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TT0:%.*]] = trunc i64 [[O0]] to i32
-; CHECK-NEXT: [[O1:%.*]] = or i64 [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TT1:%.*]] = trunc i64 [[O1]] to i32
-; CHECK-NEXT: [[O2:%.*]] = or i64 [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TT2:%.*]] = trunc i64 [[O2]] to i32
-; CHECK-NEXT: [[O3:%.*]] = or i64 [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TT3:%.*]] = trunc i64 [[O3]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 poison, i32 4>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 poison, i32 5>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
; CHECK-NEXT: br label [[BB:%.*]]
; CHECK: bb:
-; CHECK-NEXT: [[PHI0:%.*]] = phi i32 [ [[T1:%.*]], [[BB]] ], [ [[TT0]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[PHI1:%.*]] = phi i32 [ [[T2:%.*]], [[BB]] ], [ [[TT1]], [[ENTRY]] ]
-; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[T3:%.*]], [[BB]] ], [ [[TT2]], [[ENTRY]] ]
-; CHECK-NEXT: [[PHI3:%.*]] = phi i32 [ [[T4:%.*]], [[BB]] ], [ [[TT3]], [[ENTRY]] ]
-; CHECK-NEXT: [[T1]] = trunc i64 [[SHL1]] to i32
-; CHECK-NEXT: [[T2]] = trunc i64 [[SHL2]] to i32
-; CHECK-NEXT: [[T3]] = trunc i64 [[SHL3]] to i32
-; CHECK-NEXT: [[T4]] = trunc i64 [[SHL4]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
; CHECK-NEXT: br label [[BB]]
;
entry:
More information about the llvm-commits mailing list