[llvm] [VectorCombine] foldPermuteOfBinops - fold "shuffle (binop (shuffle, other)), undef" --> "binop (shuffle), (shuffle)". (PR #122118)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 13 09:55:40 PST 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/122118
>From 7d06d491fe46b1942dd5d64d9f695175c0e4dc57 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 8 Jan 2025 14:42:10 +0000
Subject: [PATCH 1/2] [VectorCombine] foldPermuteOfBinops - fold "shuffle
(binop (shuffle, other)), undef" --> "binop (shuffle), (shuffle)".
foldPermuteOfBinops currently requires both binop operands to be oneuse shuffles to fold the shuffles across the binop, but there will be cases where its still profitable to fold across the binop with only one foldable shuffle.
---
.../Transforms/Vectorize/VectorCombine.cpp | 44 +++++++++++--------
.../test/Transforms/PhaseOrdering/X86/hadd.ll | 14 +++---
.../test/Transforms/PhaseOrdering/X86/hsub.ll | 14 +++---
.../VectorCombine/X86/permute-of-binops.ll | 30 ++++++++-----
4 files changed, 57 insertions(+), 45 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 120eafae8c5ac5..e9478103f36288 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1592,17 +1592,21 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
return false;
- Value *Op00, *Op01;
- ArrayRef<int> Mask0;
- if (!match(BinOp->getOperand(0),
- m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)))))
+ Value *Op00, *Op01, *Op10, *Op11;
+ ArrayRef<int> Mask0, Mask1;
+ bool Match0 =
+ match(BinOp->getOperand(0),
+ m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
+ bool Match1 =
+ match(BinOp->getOperand(1),
+ m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
+ if (!Match0 && !Match1)
return false;
- Value *Op10, *Op11;
- ArrayRef<int> Mask1;
- if (!match(BinOp->getOperand(1),
- m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)))))
- return false;
+ Op00 = Match0 ? Op00 : BinOp->getOperand(0);
+ Op01 = Match0 ? Op01 : BinOp->getOperand(0);
+ Op10 = Match1 ? Op10 : BinOp->getOperand(1);
+ Op11 = Match1 ? Op11 : BinOp->getOperand(1);
Instruction::BinaryOps Opcode = BinOp->getOpcode();
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
@@ -1620,15 +1624,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
return false;
- // Merge outer / inner shuffles.
+ // Merge outer / inner (or identity if no match) shuffles.
SmallVector<int> NewMask0, NewMask1;
for (int M : OuterMask) {
if (M < 0 || M >= (int)NumSrcElts) {
NewMask0.push_back(PoisonMaskElem);
NewMask1.push_back(PoisonMaskElem);
} else {
- NewMask0.push_back(Mask0[M]);
- NewMask1.push_back(Mask1[M]);
+ NewMask0.push_back(Match0 ? Mask0[M] : M);
+ NewMask1.push_back(Match1 ? Mask1[M] : M);
}
}
@@ -1636,13 +1640,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy,
- OuterMask, CostKind, 0, nullptr, {BinOp}, &I) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, Mask0,
- CostKind, 0, nullptr, {Op00, Op01},
- cast<Instruction>(BinOp->getOperand(0))) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, Mask1,
- CostKind, 0, nullptr, {Op10, Op11},
- cast<Instruction>(BinOp->getOperand(1)));
+ OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
+ if (Match0)
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty,
+ Mask0, CostKind, 0, nullptr, {Op00, Op01},
+ cast<Instruction>(BinOp->getOperand(0)));
+ if (Match1)
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty,
+ Mask1, CostKind, 0, nullptr, {Op10, Op11},
+ cast<Instruction>(BinOp->getOperand(1)));
InstructionCost NewCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, NewMask0,
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index a4aea02a335117..4863ea91803ad5 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -801,10 +801,10 @@ define <2 x double> @add_v2f64_01(<2 x double> %a, <2 x double> %b) {
define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @add_v2f64_u1(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[B]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
+; CHECK-NEXT: [[RESULT1:%.*]] = fadd <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@@ -820,9 +820,9 @@ define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
define <2 x double> @add_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @add_v2f64_0u(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = fadd <2 x double> [[TMP1]], [[SHIFT]]
; CHECK-NEXT: ret <2 x double> [[RESULT]]
;
%a0 = extractelement <2 x double> %a, i32 0
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index bcb316a4a73ea6..4f67ee0cb18c04 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -801,10 +801,10 @@ define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_u1(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[B]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
+; CHECK-NEXT: [[RESULT1:%.*]] = fsub <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@@ -820,9 +820,9 @@ define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_0u(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = fsub <2 x double> [[TMP1]], [[SHIFT]]
; CHECK-NEXT: ret <2 x double> [[RESULT]]
;
%a0 = extractelement <2 x double> %a, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index 8db1990dcbb5d8..1dc324bbd63ff9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -70,16 +70,25 @@ define <4 x double> @fadd_v4f64_multiuse_op(<4 x double> %a, <4 x double> %b) {
ret <4 x double> %post
}
-; Negative test - multiple use of inner shuffle
+; Negative test - multiple use of inner shuffle (only fold if the moved shuffle is cheaper).
define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
-; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT: [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
-; CHECK-NEXT: [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: call void @use_v4f64(<4 x double> [[A1]])
-; CHECK-NEXT: ret <4 x double> [[POST]]
+; SSE-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; SSE-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; SSE-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; SSE-NEXT: ret <4 x double> [[POST]]
+;
+; AVX-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; AVX-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT: [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; AVX-NEXT: [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
+; AVX-NEXT: [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; AVX-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; AVX-NEXT: ret <4 x double> [[POST]]
;
%a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -137,6 +146,3 @@ define <4 x i32> @sdiv_v4i32_poison_idx(<4 x i32> %a, <4 x i32> %b) {
%post = shufflevector <4 x i32> %op, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 4>
ret <4 x i32> %post
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
>From a91da15be4bb95c082823d37522723f7964fa8a4 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 13 Jan 2025 17:55:26 +0000
Subject: [PATCH 2/2] Skip known identity shuffles
---
.../Transforms/Vectorize/VectorCombine.cpp | 27 +++--
.../test/Transforms/PhaseOrdering/X86/hadd.ll | 104 +++++++++--------
.../test/Transforms/PhaseOrdering/X86/hsub.ll | 105 +++++++++---------
.../AArch64/shuffletoidentity.ll | 3 +-
4 files changed, 121 insertions(+), 118 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 72e5fd617fe88a..ae2af6d3468799 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1636,6 +1636,10 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
}
}
+ unsigned NumOpElts = Op0Ty->getNumElements();
+ bool IsIdentity0 = ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
+ bool IsIdentity1 = ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
+
// Try to merge shuffles across the binop if the new shuffles are not costly.
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
@@ -1651,12 +1655,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
cast<Instruction>(BinOp->getOperand(1)));
InstructionCost NewCost =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, NewMask0,
- CostKind, 0, nullptr, {Op00, Op01}) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, NewMask1,
- CostKind, 0, nullptr, {Op10, Op11}) +
TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
+ if (!IsIdentity0)
+ NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty,
+ NewMask0, CostKind, 0, nullptr, {Op00, Op01});
+ if (!IsIdentity1)
+ NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty,
+ NewMask1, CostKind, 0, nullptr, {Op10, Op11});
+
LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
<< "\n");
@@ -1665,16 +1672,18 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
if (NewCost > OldCost)
return false;
- Value *Shuf0 = Builder.CreateShuffleVector(Op00, Op01, NewMask0);
- Value *Shuf1 = Builder.CreateShuffleVector(Op10, Op11, NewMask1);
- Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
+ Value *LHS =
+ IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
+ Value *RHS =
+ IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
+ Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
// Intersect flags from the old binops.
if (auto *NewInst = dyn_cast<Instruction>(NewBO))
NewInst->copyIRFlags(BinOp);
- Worklist.pushValue(Shuf0);
- Worklist.pushValue(Shuf1);
+ Worklist.pushValue(LHS);
+ Worklist.pushValue(RHS);
replaceValue(I, *NewBO);
return true;
}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 4863ea91803ad5..4b1234fda0e18b 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -394,13 +394,13 @@ define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
; SSE4-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
; SSE4-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
; SSE4-NEXT: [[A45:%.*]] = add i32 [[A4]], [[A5]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[TMP5]]
; SSE4-NEXT: [[HADD4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP7]]
; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
; SSE4-NEXT: ret <8 x i32> [[RESULT]]
;
@@ -605,10 +605,10 @@ define <4 x float> @add_v4f32_012u(<4 x float> %a, <4 x float> %b) {
define <4 x float> @add_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @add_v4f32_uu23(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 2>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 3>
-; CHECK-NEXT: [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret <4 x float> [[RESULT1]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 2>
+; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 3>
+; CHECK-NEXT: [[RESULT2:%.*]] = fadd <4 x float> [[TMP2]], [[RESULT1]]
+; CHECK-NEXT: ret <4 x float> [[RESULT2]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1
@@ -632,10 +632,10 @@ define <4 x float> @add_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
define <4 x float> @add_v4f32_01uu(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @add_v4f32_01uu(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret <4 x float> [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1
@@ -712,9 +712,9 @@ define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP5]], [[TMP8]]
; SSE2-NEXT: [[HADD5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
@@ -724,13 +724,13 @@ define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
; SSE4-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
; SSE4-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
; SSE4-NEXT: [[A67:%.*]] = fadd float [[A6]], [[A7]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP2]], [[TMP5]]
; SSE4-NEXT: [[HADD5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP7]]
; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
; SSE4-NEXT: ret <8 x float> [[RESULT]]
;
@@ -801,9 +801,8 @@ define <2 x double> @add_v2f64_01(<2 x double> %a, <2 x double> %b) {
define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @add_v2f64_u1(
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
-; CHECK-NEXT: [[RESULT1:%.*]] = fadd <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[RESULT1:%.*]] = fadd <2 x double> [[TMP1]], [[B]]
; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
@@ -820,10 +819,9 @@ define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
define <2 x double> @add_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @add_v2f64_0u(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[RESULT:%.*]] = fadd <2 x double> [[TMP1]], [[SHIFT]]
-; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[RESULT1:%.*]] = fadd <2 x double> [[TMP1]], [[RESULT]]
+; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@@ -884,9 +882,9 @@ define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) {
; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
; SSE4-NEXT: ret <4 x double> [[RESULT]]
;
@@ -932,10 +930,10 @@ define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 3, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 3, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
+; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
; SSE4-NEXT: ret <4 x double> [[RESULT]]
;
; AVX-LABEL: @add_v4f64_0u23(
@@ -980,9 +978,9 @@ define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
; SSE4-NEXT: ret <4 x double> [[RESULT]]
;
@@ -1028,9 +1026,9 @@ define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) {
; SSE4-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
; SSE4-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
; SSE4-NEXT: [[A23:%.*]] = fadd double [[A2]], [[A3]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
; SSE4-NEXT: ret <4 x double> [[RESULT]]
;
@@ -1069,15 +1067,15 @@ define <4 x double> @add_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT1]]
;
; SSE4-LABEL: @add_v4f64_uu23(
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
-; SSE4-NEXT: [[RESULT1:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; SSE4-NEXT: [[RESULT1:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
; SSE4-NEXT: ret <4 x double> [[RESULT1]]
;
; AVX-LABEL: @add_v4f64_uu23(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
-; AVX-NEXT: [[RESULT1:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; AVX-NEXT: [[RESULT1:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
; AVX-NEXT: ret <4 x double> [[RESULT1]]
;
%a0 = extractelement <4 x double> %a, i32 0
@@ -1109,15 +1107,15 @@ define <4 x double> @add_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[TMP4]]
;
; SSE4-LABEL: @add_v4f64_01uu(
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @add_v4f64_01uu(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
; AVX-NEXT: ret <4 x double> [[TMP3]]
;
%a0 = extractelement <4 x double> %a, i32 0
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index 4f67ee0cb18c04..c9cba0a4cc0ff2 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -394,13 +394,13 @@ define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
; SSE4-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
; SSE4-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
; SSE4-NEXT: [[A45:%.*]] = sub i32 [[A4]], [[A5]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]]
; SSE4-NEXT: [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP7]], [[TMP5]]
; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
; SSE4-NEXT: ret <8 x i32> [[RESULT]]
;
@@ -605,10 +605,10 @@ define <4 x float> @sub_v4f32_012u(<4 x float> %a, <4 x float> %b) {
define <4 x float> @sub_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @sub_v4f32_uu23(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
-; CHECK-NEXT: [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret <4 x float> [[RESULT1]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT: [[RESULT2:%.*]] = fsub <4 x float> [[TMP2]], [[RESULT1]]
+; CHECK-NEXT: ret <4 x float> [[RESULT2]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1
@@ -632,10 +632,10 @@ define <4 x float> @sub_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
define <4 x float> @sub_v4f32_01uu(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @sub_v4f32_01uu(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: ret <4 x float> [[TMP3]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1
@@ -712,9 +712,9 @@ define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
; SSE2-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP5]], [[TMP8]]
; SSE2-NEXT: [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
@@ -724,13 +724,13 @@ define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
; SSE4-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
; SSE4-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
; SSE4-NEXT: [[A67:%.*]] = fsub float [[A6]], [[A7]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP2]], [[TMP5]]
; SSE4-NEXT: [[HSUB5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP7]]
; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
; SSE4-NEXT: ret <8 x float> [[RESULT]]
;
@@ -801,9 +801,8 @@ define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_u1(
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
-; CHECK-NEXT: [[RESULT1:%.*]] = fsub <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[RESULT1:%.*]] = fsub <2 x double> [[TMP1]], [[B]]
; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
@@ -820,10 +819,9 @@ define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_0u(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[RESULT:%.*]] = fsub <2 x double> [[TMP1]], [[SHIFT]]
-; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[RESULT1:%.*]] = fsub <2 x double> [[TMP1]], [[RESULT]]
+; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@@ -884,9 +882,9 @@ define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) {
; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
; SSE4-NEXT: ret <4 x double> [[RESULT]]
;
@@ -932,14 +930,13 @@ define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 2, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = fsub <4 x double> [[TMP1]], [[TMP3]]
+; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP2]], double [[B23]], i64 3
; SSE4-NEXT: ret <4 x double> [[RESULT]]
;
; AVX-LABEL: @sub_v4f64_0u23(
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
; AVX-NEXT: [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
; AVX-NEXT: ret <4 x double> [[TMP4]]
@@ -980,9 +977,9 @@ define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
; SSE4-NEXT: ret <4 x double> [[RESULT]]
;
@@ -1028,9 +1025,9 @@ define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) {
; SSE4-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
; SSE4-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
; SSE4-NEXT: [[A23:%.*]] = fsub double [[A2]], [[A3]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
; SSE4-NEXT: ret <4 x double> [[RESULT]]
;
@@ -1069,15 +1066,15 @@ define <4 x double> @sub_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT1]]
;
; SSE4-LABEL: @sub_v4f64_uu23(
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
-; SSE4-NEXT: [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; SSE4-NEXT: [[RESULT1:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
; SSE4-NEXT: ret <4 x double> [[RESULT1]]
;
; AVX-LABEL: @sub_v4f64_uu23(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
-; AVX-NEXT: [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; AVX-NEXT: [[RESULT1:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
; AVX-NEXT: ret <4 x double> [[RESULT1]]
;
%a0 = extractelement <4 x double> %a, i32 0
@@ -1109,15 +1106,15 @@ define <4 x double> @sub_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[TMP4]]
;
; SSE4-LABEL: @sub_v4f64_01uu(
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @sub_v4f64_01uu(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
; AVX-NEXT: ret <4 x double> [[TMP3]]
;
%a0 = extractelement <4 x double> %a, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index f4c27794d3930c..09875c5e0af409 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -937,9 +937,8 @@ define <4 x i64> @cast_mismatched_types(<4 x i32> %x) {
define <4 x float> @fadd_mismatched_types(<4 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @fadd_mismatched_types(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 2, i32 poison>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT: [[EXTSHUF:%.*]] = fadd fast <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[EXTSHUF:%.*]] = fadd fast <4 x float> [[TMP1:%.*]], [[TMP2]]
; CHECK-NEXT: ret <4 x float> [[EXTSHUF]]
;
%shuf.x = shufflevector <4 x float> %x, <4 x float> poison, <2 x i32> <i32 0, i32 2>
More information about the llvm-commits
mailing list