[llvm] [WIP][VectorCombine] Fold insert(binop(x,y),binop(a,b),idx) --> binop(insert(x,a,idx),insert(y,b,idx)) (PR #124909)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 29 09:44:58 PST 2025
- Previous message: [llvm] [WIP][VectorCombine] Fold insert(binop(x,y),binop(a,b),idx) --> binop(insert(x,a,idx),insert(x,a,idx)) (PR #124909)
- Next message: [llvm] [WIP][VectorCombine] Fold insert(binop(x,y),binop(a,b),idx) --> binop(insert(x,a,idx),insert(x,a,idx)) (PR #124909)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/124909
>From b006ef112b5a46b489302ce8484b8f1824bef8bc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 29 Jan 2025 11:41:16 +0000
Subject: [PATCH] [WIP][VectorCombine] Fold insert(binop(x,y),binop(a,b),idx)
--> binop(insert(x,a,idx),insert(y,b,idx))
Add a foldInsExtBinop fold to clean up missed vectorization cases. These can happen on targets with cheap insert/extract instructions, which prevent foldExtractExtract (binop(extract(x),extract(y)) -> extract(binop(x,shuffle(y)))) from helping with the merge.
WIP - still need to add VectorCombine specific tests
---
.../Transforms/Vectorize/VectorCombine.cpp | 60 ++++++
.../test/Transforms/PhaseOrdering/X86/hadd.ll | 180 ++++++------------
.../test/Transforms/PhaseOrdering/X86/hsub.ll | 179 ++++++-----------
.../Transforms/PhaseOrdering/X86/pr50392.ll | 10 +-
4 files changed, 169 insertions(+), 260 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 59920b5a4dd20a..09e4b8cc17fbbc 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -108,6 +108,7 @@ class VectorCombine {
Instruction &I);
bool foldExtractExtract(Instruction &I);
bool foldInsExtFNeg(Instruction &I);
+ bool foldInsExtBinop(Instruction &I);
bool foldInsExtVectorToShuffle(Instruction &I);
bool foldBitcastShuffle(Instruction &I);
bool scalarizeBinopOrCmp(Instruction &I);
@@ -738,6 +739,64 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
return true;
}
+/// Try to fold insert(binop(x,y),binop(a,b),idx)
+/// --> binop(insert(x,a,idx),insert(y,b,idx))
+bool VectorCombine::foldInsExtBinop(Instruction &I) {
+ BinaryOperator *VecBinOp, *SclBinOp;
+ uint64_t Index;
+ if (!match(&I,
+ m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
+ m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
+ return false;
+
+ // TODO: Add support for addlike etc.
+ Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
+ if (BinOpcode != SclBinOp->getOpcode())
+ return false;
+
+ auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
+ if (!ResultTy)
+ return false;
+
+ // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
+ // shuffle?
+
+ InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
+ TTI.getInstructionCost(VecBinOp, CostKind) +
+ TTI.getInstructionCost(SclBinOp, CostKind);
+ InstructionCost NewCost =
+ TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
+ TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
+ Index, VecBinOp->getOperand(0),
+ SclBinOp->getOperand(0)) +
+ TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
+ Index, VecBinOp->getOperand(1),
+ SclBinOp->getOperand(1));
+
+ LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
+ << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+ << "\n");
+ if (NewCost > OldCost)
+ return false;
+
+ Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
+ SclBinOp->getOperand(0), Index);
+ Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
+ SclBinOp->getOperand(1), Index);
+ Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
+
+ // Copy VecBinOp's IR flags to the new binop, then intersect with SclBinOp's.
+ if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
+ NewInst->copyIRFlags(VecBinOp);
+ NewInst->andIRFlags(SclBinOp);
+ }
+
+ Worklist.pushValue(NewIns0);
+ Worklist.pushValue(NewIns1);
+ replaceValue(I, *NewBO);
+ return true;
+}
+
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
@@ -3206,6 +3265,7 @@ bool VectorCombine::run() {
switch (Opcode) {
case Instruction::InsertElement:
MadeChange |= foldInsExtFNeg(I);
+ MadeChange |= foldInsExtBinop(I);
MadeChange |= foldInsExtVectorToShuffle(I);
break;
case Instruction::ShuffleVector:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 9b139a4a4e234c..28b48bd3ce6d9e 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -272,19 +272,17 @@ define <16 x i16> @add_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: @add_v16i16_0123u56789uBCDEF(
-; SSE2-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
-; SSE2-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
-; SSE2-NEXT: [[A89:%.*]] = add i16 [[A8]], [[A9]]
; SSE2-NEXT: [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
; SSE2-NEXT: [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
; SSE2-NEXT: [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
; SSE2-NEXT: [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
; SSE2-NEXT: [[BCD:%.*]] = add i16 [[BC]], [[BD]]
; SSE2-NEXT: [[BEF:%.*]] = add i16 [[BE]], [[BF]]
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[HADD8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HADD8:%.*]] = add <16 x i16> [[TMP3]], [[TMP7]]
; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 poison, i32 14, i32 24, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 poison, i32 15, i32 25, i32 27, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
@@ -295,22 +293,16 @@ define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: ret <16 x i16> [[RESULT]]
;
; SSE4-LABEL: @add_v16i16_0123u56789uBCDEF(
-; SSE4-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
-; SSE4-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
-; SSE4-NEXT: [[A89:%.*]] = add i16 [[A8]], [[A9]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[HADD8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 10, i32 poison, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 11, i32 poison, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
-; SSE4-NEXT: [[HADDB2:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 26, i32 poison, i32 30, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 27, i32 poison, i32 31, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 24, i32 26, i32 28, i32 30>
+; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP6]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 25, i32 27, i32 29, i32 31>
; SSE4-NEXT: [[TMP9:%.*]] = add <16 x i16> [[TMP7]], [[TMP8]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDB2]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE4-NEXT: ret <16 x i16> [[RESULT]]
+; SSE4-NEXT: ret <16 x i16> [[TMP9]]
;
; AVX2-LABEL: @add_v16i16_0123u56789uBCDEF(
; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 9, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
@@ -394,19 +386,17 @@ define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @add_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: @add_v16i16_FEuCBA98765432u0(
-; SSE2-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
-; SSE2-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
-; SSE2-NEXT: [[A89:%.*]] = add i16 [[A8]], [[A9]]
; SSE2-NEXT: [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
; SSE2-NEXT: [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
; SSE2-NEXT: [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
; SSE2-NEXT: [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
; SSE2-NEXT: [[BCD:%.*]] = add i16 [[BC]], [[BD]]
; SSE2-NEXT: [[BEF:%.*]] = add i16 [[BE]], [[BF]]
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[HADD8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HADD8:%.*]] = add <16 x i16> [[TMP3]], [[TMP7]]
; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 12, i32 14, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 13, i32 15, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
@@ -417,21 +407,17 @@ define <16 x i16> @add_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: ret <16 x i16> [[RESULT]]
;
; SSE4-LABEL: @add_v16i16_FEuCBA98765432u0(
-; SSE4-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
-; SSE4-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
-; SSE4-NEXT: [[A89:%.*]] = add i16 [[A8]], [[A9]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[HADD8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 11, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 10, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 27, i32 28, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 26, i32 29, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
-; SSE4-NEXT: [[HADDA2:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP9:%.*]] = add <16 x i16> [[TMP7]], [[TMP8]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[HADDA2]], <16 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
+; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32> <i32 3, i32 2, i32 poison, i32 1, i32 0, i32 27, i32 26, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
; SSE4-NEXT: ret <16 x i16> [[RESULT]]
;
; AVX2-LABEL: @add_v16i16_FEuCBA98765432u0(
@@ -826,31 +812,11 @@ define <8 x i32> @add_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
-; SSE2-LABEL: @add_v8i32_01234u67(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
-; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
-; SSE2-NEXT: [[RESULT:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: ret <8 x i32> [[RESULT]]
-;
-; SSE4-LABEL: @add_v8i32_01234u67(
-; SSE4-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
-; SSE4-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SSE4-NEXT: [[A45:%.*]] = add i32 [[A4]], [[A5]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[HADD4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
-; SSE4-NEXT: ret <8 x i32> [[RESULT]]
-;
-; AVX-LABEL: @add_v8i32_01234u67(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
-; AVX-NEXT: [[RESULT:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT: ret <8 x i32> [[RESULT]]
+; CHECK-LABEL: @add_v8i32_01234u67(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
+; CHECK-NEXT: [[RESULT:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <8 x i32> [[RESULT]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
@@ -1148,33 +1114,21 @@ define <8 x float> @add_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: @add_v8f32_012u4567(
-; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE2-NEXT: [[A67:%.*]] = fadd float [[A6]], [[A7]]
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP5]], [[TMP8]]
-; SSE2-NEXT: [[HADD5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
; SSE2-NEXT: ret <8 x float> [[RESULT]]
;
; SSE4-LABEL: @add_v8f32_012u4567(
-; SSE4-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE4-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE4-NEXT: [[A67:%.*]] = fadd float [[A6]], [[A7]]
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP7]]
-; SSE4-NEXT: [[HADD5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
-; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 12, i32 15>
; SSE4-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[TMP8]], [[TMP5]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
-; SSE4-NEXT: ret <8 x float> [[RESULT]]
+; SSE4-NEXT: ret <8 x float> [[TMP9]]
;
; AVX-LABEL: @add_v8f32_012u4567(
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
@@ -1236,18 +1190,10 @@ define <8 x float> @add_v8f32_76u43210(<8 x float> %a, <8 x float> %b) {
; SSE2-NEXT: ret <8 x float> [[RESULT]]
;
; SSE4-LABEL: @add_v8f32_76u43210(
-; SSE4-NEXT: [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i64 0
-; SSE4-NEXT: [[B1:%.*]] = extractelement <8 x float> [[B]], i64 1
-; SSE4-NEXT: [[B01:%.*]] = fadd float [[B0]], [[B1]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[RESULT:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[HADD4:%.*]] = insertelement <8 x float> [[RESULT]], float [[B01]], i64 4
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 5, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 7, i32 4, i32 poison, i32 1, i32 15, i32 13, i32 11, i32 9>
; SSE4-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
-; SSE4-NEXT: [[RESULT1:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[HADD4]], <8 x i32> <i32 1, i32 0, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
-; SSE4-NEXT: ret <8 x float> [[RESULT1]]
+; SSE4-NEXT: ret <8 x float> [[TMP6]]
;
; AVX-LABEL: @add_v8f32_76u43210(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 5, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
@@ -1394,14 +1340,10 @@ define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @add_v4f64_u123(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @add_v4f64_u123(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
@@ -1442,14 +1384,10 @@ define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @add_v4f64_0u23(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 3, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 poison, i32 3, i32 7>
; SSE4-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP4]]
;
; AVX-LABEL: @add_v4f64_0u23(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
@@ -1490,14 +1428,10 @@ define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @add_v4f64_01u3(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @add_v4f64_01u3(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
@@ -1538,14 +1472,10 @@ define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @add_v4f64_012u(
-; SSE4-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
-; SSE4-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SSE4-NEXT: [[A23:%.*]] = fadd double [[A2]], [[A3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @add_v4f64_012u(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
@@ -1666,14 +1596,10 @@ define <4 x double> @add_v4f64_32u0(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @add_v4f64_32u0(
-; SSE4-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SSE4-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SSE4-NEXT: [[A01:%.*]] = fadd double [[A0]], [[A1]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
; SSE4-NEXT: [[RESULT:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[RESULT1:%.*]] = insertelement <4 x double> [[RESULT]], double [[A01]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT1]]
+; SSE4-NEXT: ret <4 x double> [[RESULT]]
;
; AVX-LABEL: @add_v4f64_32u0(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index 998877e246b164..0062527b678c95 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -272,19 +272,17 @@ define <16 x i16> @sub_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: @sub_v16i16_0123u56789uBCDEF(
-; SSE2-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
-; SSE2-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
-; SSE2-NEXT: [[A89:%.*]] = sub i16 [[A8]], [[A9]]
; SSE2-NEXT: [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
; SSE2-NEXT: [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
; SSE2-NEXT: [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
; SSE2-NEXT: [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
; SSE2-NEXT: [[BCD:%.*]] = sub i16 [[BC]], [[BD]]
; SSE2-NEXT: [[BEF:%.*]] = sub i16 [[BE]], [[BF]]
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HSUB8:%.*]] = sub <16 x i16> [[TMP3]], [[TMP7]]
; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 poison, i32 14, i32 24, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 poison, i32 15, i32 25, i32 27, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
@@ -295,22 +293,16 @@ define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: ret <16 x i16> [[RESULT]]
;
; SSE4-LABEL: @sub_v16i16_0123u56789uBCDEF(
-; SSE4-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
-; SSE4-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
-; SSE4-NEXT: [[A89:%.*]] = sub i16 [[A8]], [[A9]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 10, i32 poison, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 11, i32 poison, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
-; SSE4-NEXT: [[HSUBB2:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 26, i32 poison, i32 30, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 27, i32 poison, i32 31, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 24, i32 26, i32 28, i32 30>
+; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP6]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 25, i32 27, i32 29, i32 31>
; SSE4-NEXT: [[TMP9:%.*]] = sub <16 x i16> [[TMP7]], [[TMP8]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBB2]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
-; SSE4-NEXT: ret <16 x i16> [[RESULT]]
+; SSE4-NEXT: ret <16 x i16> [[TMP9]]
;
; AVX-LABEL: @sub_v16i16_0123u56789uBCDEF(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 8, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
@@ -388,19 +380,17 @@ define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @sub_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: @sub_v16i16_FEuCBA98765432u0(
-; SSE2-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
-; SSE2-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
-; SSE2-NEXT: [[A89:%.*]] = sub i16 [[A8]], [[A9]]
; SSE2-NEXT: [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
; SSE2-NEXT: [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
; SSE2-NEXT: [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
; SSE2-NEXT: [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
; SSE2-NEXT: [[BCD:%.*]] = sub i16 [[BC]], [[BD]]
; SSE2-NEXT: [[BEF:%.*]] = sub i16 [[BE]], [[BF]]
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[HSUB8:%.*]] = sub <16 x i16> [[TMP3]], [[TMP7]]
; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 12, i32 14, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 13, i32 15, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
@@ -411,21 +401,17 @@ define <16 x i16> @sub_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: ret <16 x i16> [[RESULT]]
;
; SSE4-LABEL: @sub_v16i16_FEuCBA98765432u0(
-; SSE4-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
-; SSE4-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
-; SSE4-NEXT: [[A89:%.*]] = sub i16 [[A8]], [[A9]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[A89]], i64 8
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 10, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 11, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 26, i32 28, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 27, i32 29, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
-; SSE4-NEXT: [[HSUBA2:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP9:%.*]] = sub <16 x i16> [[TMP7]], [[TMP8]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[HSUBA2]], <16 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
+; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32> <i32 3, i32 2, i32 poison, i32 1, i32 0, i32 27, i32 26, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
; SSE4-NEXT: ret <16 x i16> [[RESULT]]
;
; AVX2-LABEL: @sub_v16i16_FEuCBA98765432u0(
@@ -820,31 +806,11 @@ define <8 x i32> @sub_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
-; SSE2-LABEL: @sub_v8i32_01234u67(
-; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
-; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
-; SSE2-NEXT: [[RESULT:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: ret <8 x i32> [[RESULT]]
-;
-; SSE4-LABEL: @sub_v8i32_01234u67(
-; SSE4-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
-; SSE4-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SSE4-NEXT: [[A45:%.*]] = sub i32 [[A4]], [[A5]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
-; SSE4-NEXT: ret <8 x i32> [[RESULT]]
-;
-; AVX-LABEL: @sub_v8i32_01234u67(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
-; AVX-NEXT: [[RESULT:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; AVX-NEXT: ret <8 x i32> [[RESULT]]
+; CHECK-LABEL: @sub_v8i32_01234u67(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; CHECK-NEXT: [[RESULT:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <8 x i32> [[RESULT]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
@@ -1142,33 +1108,21 @@ define <8 x float> @sub_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: @sub_v8f32_012u4567(
-; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE2-NEXT: [[A67:%.*]] = fsub float [[A6]], [[A7]]
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
; SSE2-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP5]], [[TMP8]]
-; SSE2-NEXT: [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
; SSE2-NEXT: ret <8 x float> [[RESULT]]
;
; SSE4-LABEL: @sub_v8f32_012u4567(
-; SSE4-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE4-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE4-NEXT: [[A67:%.*]] = fsub float [[A6]], [[A7]]
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP7]]
-; SSE4-NEXT: [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
-; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
; SSE4-NEXT: [[TMP9:%.*]] = fsub <8 x float> [[TMP8]], [[TMP5]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
-; SSE4-NEXT: ret <8 x float> [[RESULT]]
+; SSE4-NEXT: ret <8 x float> [[TMP9]]
;
; AVX-LABEL: @sub_v8f32_012u4567(
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
@@ -1230,18 +1184,10 @@ define <8 x float> @sub_v8f32_76u43210(<8 x float> %a, <8 x float> %b) {
; SSE2-NEXT: ret <8 x float> [[RESULT]]
;
; SSE4-LABEL: @sub_v8f32_76u43210(
-; SSE4-NEXT: [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i64 0
-; SSE4-NEXT: [[B1:%.*]] = extractelement <8 x float> [[B]], i64 1
-; SSE4-NEXT: [[B01:%.*]] = fsub float [[B0]], [[B1]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[RESULT:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[HSUB4:%.*]] = insertelement <8 x float> [[RESULT]], float [[B01]], i64 4
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 4, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 7, i32 5, i32 poison, i32 1, i32 15, i32 13, i32 11, i32 9>
; SSE4-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
-; SSE4-NEXT: [[RESULT1:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[HSUB4]], <8 x i32> <i32 1, i32 0, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
-; SSE4-NEXT: ret <8 x float> [[RESULT1]]
+; SSE4-NEXT: ret <8 x float> [[TMP6]]
;
; AVX-LABEL: @sub_v8f32_76u43210(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 4, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
@@ -1388,14 +1334,10 @@ define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_u123(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @sub_v4f64_u123(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
@@ -1436,13 +1378,10 @@ define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_0u23(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
; SSE4-NEXT: [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP4]]
;
; AVX-LABEL: @sub_v4f64_0u23(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
@@ -1483,14 +1422,10 @@ define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_01u3(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @sub_v4f64_01u3(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
@@ -1531,14 +1466,10 @@ define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_012u(
-; SSE4-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
-; SSE4-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SSE4-NEXT: [[A23:%.*]] = fsub double [[A2]], [[A3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @sub_v4f64_012u(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
@@ -1659,14 +1590,10 @@ define <4 x double> @sub_v4f64_32u0(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_32u0(
-; SSE4-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; SSE4-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; SSE4-NEXT: [[A01:%.*]] = fsub double [[A0]], [[A1]]
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 3, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
; SSE4-NEXT: [[RESULT:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[RESULT1:%.*]] = insertelement <4 x double> [[RESULT]], double [[A01]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT1]]
+; SSE4-NEXT: ret <4 x double> [[RESULT]]
;
; AVX-LABEL: @sub_v4f64_32u0(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
index d92df9741644b7..842cf42505673c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
@@ -21,14 +21,10 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[SHUFFLE]]
;
; SSE4-LABEL: @PR50392(
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SSE4-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]]
-; SSE4-NEXT: [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD12]], i64 3
-; SSE4-NEXT: ret <4 x double> [[SHUFFLE]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @PR50392(
; AVX-NEXT: [[B:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B1:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
- Previous message: [llvm] [WIP][VectorCombine] Fold insert(binop(x,y),binop(a,b),idx) --> binop(insert(x,a,idx),insert(y,b,idx)) (PR #124909)
- Next message: [llvm] [WIP][VectorCombine] Fold insert(binop(x,y),binop(a,b),idx) --> binop(insert(x,a,idx),insert(y,b,idx)) (PR #124909)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the llvm-commits mailing list