[llvm] [CostModel] Add getShuffleCostImpl. (PR #145373)

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 23 10:44:20 PDT 2025


https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/145373

The idea was to have getShuffleCost as the external interface: getInstructionCost would always call it, and the existing implementations were moved into getShuffleCostImpl. This would allow targets to override getShuffleCost if they wanted direct access to the costs, rather than going via the routines that recognize rev/broadcast/etc.

Too many tests change though, as getInstructionCost and getShuffleCost can return different values at the moment. Not expecting this to be used, just putting up to show the diff.

>From d6e68094ba6a52322cde4bd65d99c63d0de030cb Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 23 Jun 2025 18:43:58 +0100
Subject: [PATCH] [CostModel] Add getShuffleCostImpl.

The idea was to have getShuffleCost as the external interface,
getInstructionCost would always call it, the existing implementations were moved
into getShuffleCostImpl. This would allow targets to override the
getShuffleCost if they wanted direct access to the costs, not via the routines
that recognize rev/broadcast/etc.

Too many tests change though, as getInstructionCost and getShuffleCost can
return different values at the moment. Just putting up to show the diff.
---
 .../llvm/Analysis/TargetTransformInfo.h       |   6 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   | 120 +---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      | 128 +++-
 .../AArch64/AArch64TargetTransformInfo.cpp    |  23 +-
 .../AArch64/AArch64TargetTransformInfo.h      |   9 +-
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  21 +-
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |   9 +-
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |  17 +-
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h  |   9 +-
 .../Hexagon/HexagonTargetTransformInfo.cpp    |  11 +-
 .../Hexagon/HexagonTargetTransformInfo.h      |   9 +-
 .../Target/PowerPC/PPCTargetTransformInfo.cpp |  13 +-
 .../Target/PowerPC/PPCTargetTransformInfo.h   |   9 +-
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  48 +-
 .../Target/RISCV/RISCVTargetTransformInfo.h   |   9 +-
 .../SystemZ/SystemZTargetTransformInfo.cpp    |  15 +-
 .../SystemZ/SystemZTargetTransformInfo.h      |   9 +-
 .../lib/Target/X86/X86TargetTransformInfo.cpp | 122 ++--
 llvm/lib/Target/X86/X86TargetTransformInfo.h  |   9 +-
 .../Transforms/PhaseOrdering/X86/fmaddsub.ll  |  57 +-
 .../test/Transforms/PhaseOrdering/X86/hadd.ll | 624 ++++++++++++-----
 .../test/Transforms/PhaseOrdering/X86/hsub.ll | 637 +++++++++++++-----
 .../Transforms/PhaseOrdering/X86/pr50392.ll   |  11 +-
 .../Transforms/PhaseOrdering/X86/pr94546.ll   |   7 +-
 .../AArch64/shuffletoidentity-concat.ll       |  40 +-
 .../AArch64/shuffletoidentity.ll              |  37 +-
 .../RISCV/shuffle-of-intrinsics.ll            |  12 +-
 .../VectorCombine/X86/permute-of-binops.ll    |  69 +-
 .../VectorCombine/X86/shuffle-of-binops.ll    |  19 +-
 .../VectorCombine/X86/shuffle-of-casts.ll     |   5 +-
 .../VectorCombine/X86/shuffle-of-selects.ll   |  28 +-
 31 files changed, 1354 insertions(+), 788 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 90d92e0fcf55c..06a01f2168545 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1394,6 +1394,12 @@ class TargetTransformInfo {
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, int Index = 0,
       VectorType *SubTp = nullptr, ArrayRef<const Value *> Args = {},
       const Instruction *CxtI = nullptr) const;
+  LLVM_ABI InstructionCost getShuffleCostImpl(
+      ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+      ArrayRef<int> Mask = {},
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, int Index = 0,
+      VectorType *SubTp = nullptr, ArrayRef<const Value *> Args = {},
+      const Instruction *CxtI = nullptr) const;
 
   /// Represents a hint about the context in which a cast is used.
   ///
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index c22928c9bcd94..2a1f9d94a88d2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -709,6 +709,15 @@ class TargetTransformInfoImplBase {
     return InstructionCost::getInvalid();
   }
 
+  virtual InstructionCost
+  getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                     VectorType *SrcTy, ArrayRef<int> Mask,
+                     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+                     ArrayRef<const Value *> Args = {},
+                     const Instruction *CxtI = nullptr) const {
+    return 1;
+  }
+
   virtual InstructionCost
   getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                  ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
@@ -1532,114 +1541,11 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase {
       auto *VecTy = cast<VectorType>(U->getType());
       auto *VecSrcTy = cast<VectorType>(Operands[0]->getType());
       ArrayRef<int> Mask = Shuffle->getShuffleMask();
-      int NumSubElts, SubIndex;
-
-      // TODO: move more of this inside improveShuffleKindFromMask.
-      if (Shuffle->changesLength()) {
-        // Treat a 'subvector widening' as a free shuffle.
-        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding())
-          return 0;
-
-        if (Shuffle->isExtractSubvectorMask(SubIndex))
-          return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
-                                           VecSrcTy, Mask, CostKind, SubIndex,
-                                           VecTy, Operands, Shuffle);
-
-        if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
-          return TargetTTI->getShuffleCost(
-              TTI::SK_InsertSubvector, VecTy, VecSrcTy, Mask, CostKind,
-              SubIndex,
-              FixedVectorType::get(VecTy->getScalarType(), NumSubElts),
-              Operands, Shuffle);
-
-        int ReplicationFactor, VF;
-        if (Shuffle->isReplicationMask(ReplicationFactor, VF)) {
-          APInt DemandedDstElts = APInt::getZero(Mask.size());
-          for (auto I : enumerate(Mask)) {
-            if (I.value() != PoisonMaskElem)
-              DemandedDstElts.setBit(I.index());
-          }
-          return TargetTTI->getReplicationShuffleCost(
-              VecSrcTy->getElementType(), ReplicationFactor, VF,
-              DemandedDstElts, CostKind);
-        }
-
-        bool IsUnary = isa<UndefValue>(Operands[1]);
-        NumSubElts = VecSrcTy->getElementCount().getKnownMinValue();
-        SmallVector<int, 16> AdjustMask(Mask);
-
-        // Widening shuffle - widening the source(s) to the new length
-        // (treated as free - see above), and then perform the adjusted
-        // shuffle at that width.
-        if (Shuffle->increasesLength()) {
-          for (int &M : AdjustMask)
-            M = M >= NumSubElts ? (M + (Mask.size() - NumSubElts)) : M;
-
-          return TargetTTI->getShuffleCost(
-              IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, VecTy,
-              VecTy, AdjustMask, CostKind, 0, nullptr, Operands, Shuffle);
-        }
-
-        // Narrowing shuffle - perform shuffle at original wider width and
-        // then extract the lower elements.
-        // FIXME: This can assume widening, which is not true of all vector
-        // architectures (and is not even the default).
-        AdjustMask.append(NumSubElts - Mask.size(), PoisonMaskElem);
-
-        InstructionCost ShuffleCost = TargetTTI->getShuffleCost(
-            IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc,
-            VecSrcTy, VecSrcTy, AdjustMask, CostKind, 0, nullptr, Operands,
-            Shuffle);
-
-        SmallVector<int, 16> ExtractMask(Mask.size());
-        std::iota(ExtractMask.begin(), ExtractMask.end(), 0);
-        return ShuffleCost + TargetTTI->getShuffleCost(
-                                 TTI::SK_ExtractSubvector, VecTy, VecSrcTy,
-                                 ExtractMask, CostKind, 0, VecTy, {}, Shuffle);
-      }
-
-      if (Shuffle->isIdentity())
-        return 0;
+      bool IsUnary = isa<UndefValue>(Operands[1]);
 
-      if (Shuffle->isReverse())
-        return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, VecSrcTy, Mask,
-                                         CostKind, 0, nullptr, Operands,
-                                         Shuffle);
-
-      if (Shuffle->isSelect())
-        return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, VecSrcTy, Mask,
-                                         CostKind, 0, nullptr, Operands,
-                                         Shuffle);
-
-      if (Shuffle->isTranspose())
-        return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, VecSrcTy,
-                                         Mask, CostKind, 0, nullptr, Operands,
-                                         Shuffle);
-
-      if (Shuffle->isZeroEltSplat())
-        return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, VecSrcTy,
-                                         Mask, CostKind, 0, nullptr, Operands,
-                                         Shuffle);
-
-      if (Shuffle->isSingleSource())
-        return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy,
-                                         VecSrcTy, Mask, CostKind, 0, nullptr,
-                                         Operands, Shuffle);
-
-      if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
-        return TargetTTI->getShuffleCost(
-            TTI::SK_InsertSubvector, VecTy, VecSrcTy, Mask, CostKind, SubIndex,
-            FixedVectorType::get(VecTy->getScalarType(), NumSubElts), Operands,
-            Shuffle);
-
-      if (Shuffle->isSplice(SubIndex))
-        return TargetTTI->getShuffleCost(TTI::SK_Splice, VecTy, VecSrcTy, Mask,
-                                         CostKind, SubIndex, nullptr, Operands,
-                                         Shuffle);
-
-      return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, VecSrcTy,
-                                       Mask, CostKind, 0, nullptr, Operands,
-                                       Shuffle);
+      return TargetTTI->getShuffleCost(
+          IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, VecTy,
+          VecSrcTy, Mask, CostKind, 0, nullptr, Operands, Shuffle);
     }
     case Instruction::ExtractElement: {
       auto *EEI = dyn_cast<ExtractElementInst>(U);
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 0477c1b6f1a6f..2270b8118833e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -331,7 +331,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
         thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
     if (VD->isMasked()) {
       auto VecTy = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
-      Cost += thisT()->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
+      Cost +=
+          thisT()->getShuffleCostImpl(TargetTransformInfo::SK_Broadcast, VecTy,
                                       VecTy, {}, CostKind, 0, nullptr, {});
     }
 
@@ -1151,10 +1152,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   }
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
-                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
-                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
-                 const Instruction *CxtI = nullptr) const override {
+  getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                     VectorType *SrcTy, ArrayRef<int> Mask,
+                     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+                     ArrayRef<const Value *> Args = {},
+                     const Instruction *CxtI = nullptr) const override {
     switch (improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp)) {
     case TTI::SK_Broadcast:
       if (auto *FVT = dyn_cast<FixedVectorType>(SrcTy))
@@ -1179,6 +1181,122 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     llvm_unreachable("Unknown TTI::ShuffleKind");
   }
 
+  InstructionCost
+  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
+                 const Instruction *CxtI = nullptr) const override {
+    // TODO: move more of this inside improveShuffleKindFromMask?
+    if (auto *Shuffle = dyn_cast_if_present<ShuffleVectorInst>(CxtI)) {
+      int NumSubElts, SubIndex;
+
+      if (Shuffle->changesLength()) {
+        // Treat a 'subvector widening' as a free shuffle.
+        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding())
+          return 0;
+
+        if (Shuffle->isExtractSubvectorMask(SubIndex))
+          return thisT()->getShuffleCostImpl(TTI::SK_ExtractSubvector, DstTy,
+                                             SrcTy, Mask, CostKind, SubIndex,
+                                             DstTy, Args, Shuffle);
+
+        if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
+          return thisT()->getShuffleCostImpl(
+              TTI::SK_InsertSubvector, DstTy, SrcTy, Mask, CostKind, SubIndex,
+              FixedVectorType::get(DstTy->getScalarType(), NumSubElts), Args,
+              Shuffle);
+
+        int ReplicationFactor, VF;
+        if (Shuffle->isReplicationMask(ReplicationFactor, VF)) {
+          APInt DemandedDstElts = APInt::getZero(Mask.size());
+          for (auto I : enumerate(Mask)) {
+            if (I.value() != PoisonMaskElem)
+              DemandedDstElts.setBit(I.index());
+          }
+          return thisT()->getReplicationShuffleCost(SrcTy->getElementType(),
+                                                    ReplicationFactor, VF,
+                                                    DemandedDstElts, CostKind);
+        }
+
+        bool IsUnary = Args.size() < 2 || isa<UndefValue>(Args[1]);
+        NumSubElts = SrcTy->getElementCount().getKnownMinValue();
+        SmallVector<int, 16> AdjustMask(Mask);
+
+        // Widening shuffle - widening the source(s) to the new length
+        // (treated as free - see above), and then perform the adjusted
+        // shuffle at that width.
+        if (Shuffle->increasesLength()) {
+          for (int &M : AdjustMask)
+            M = M >= NumSubElts ? (M + (Mask.size() - NumSubElts)) : M;
+
+          return thisT()->getShuffleCostImpl(
+              IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, DstTy,
+              DstTy, AdjustMask, CostKind, 0, nullptr, Args, Shuffle);
+        }
+
+        // Narrowing shuffle - perform shuffle at original wider width and
+        // then extract the lower elements.
+        // FIXME: This can assume widening, which is not true of all vector
+        // architectures (and is not even the default).
+        AdjustMask.append(NumSubElts - Mask.size(), PoisonMaskElem);
+
+        InstructionCost ShuffleCost = thisT()->getShuffleCostImpl(
+            IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, SrcTy,
+            SrcTy, AdjustMask, CostKind, 0, nullptr, Args, Shuffle);
+
+        SmallVector<int, 16> ExtractMask(Mask.size());
+        std::iota(ExtractMask.begin(), ExtractMask.end(), 0);
+        return ShuffleCost + thisT()->getShuffleCostImpl(
+                                 TTI::SK_ExtractSubvector, DstTy, SrcTy,
+                                 ExtractMask, CostKind, 0, DstTy, {}, Shuffle);
+      }
+
+      if (Shuffle->isIdentity())
+        return 0;
+
+      if (Shuffle->isReverse())
+        return thisT()->getShuffleCostImpl(TTI::SK_Reverse, DstTy, SrcTy, Mask,
+                                           CostKind, 0, nullptr, Args, Shuffle);
+
+      if (Shuffle->isSelect())
+        return thisT()->getShuffleCostImpl(TTI::SK_Select, DstTy, SrcTy, Mask,
+                                           CostKind, 0, nullptr, Args, Shuffle);
+
+      if (Shuffle->isTranspose())
+        return thisT()->getShuffleCostImpl(TTI::SK_Transpose, DstTy, SrcTy,
+                                           Mask, CostKind, 0, nullptr, Args,
+                                           Shuffle);
+
+      if (Shuffle->isZeroEltSplat())
+        return thisT()->getShuffleCostImpl(TTI::SK_Broadcast, DstTy, SrcTy,
+                                           Mask, CostKind, 0, nullptr, Args,
+                                           Shuffle);
+
+      if (Shuffle->isSingleSource())
+        return thisT()->getShuffleCostImpl(TTI::SK_PermuteSingleSrc, DstTy,
+                                           SrcTy, Mask, CostKind, 0, nullptr,
+                                           Args, Shuffle);
+
+      if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex))
+        return thisT()->getShuffleCostImpl(
+            TTI::SK_InsertSubvector, DstTy, SrcTy, Mask, CostKind, SubIndex,
+            FixedVectorType::get(DstTy->getScalarType(), NumSubElts), Args,
+            Shuffle);
+
+      if (Shuffle->isSplice(SubIndex))
+        return thisT()->getShuffleCostImpl(TTI::SK_Splice, DstTy, SrcTy, Mask,
+                                           CostKind, SubIndex, nullptr, Args,
+                                           Shuffle);
+
+      return thisT()->getShuffleCostImpl(TTI::SK_PermuteTwoSrc, DstTy, SrcTy,
+                                         Mask, CostKind, Index, SubTp, Args,
+                                         CxtI);
+    }
+
+    return thisT()->getShuffleCostImpl(Kind, DstTy, SrcTy, Mask, CostKind,
+                                       Index, SubTp, Args, CxtI);
+  }
+
   InstructionCost
   getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                    TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 15e38e6cb2408..235873a40a03f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5468,12 +5468,11 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   return Cost;
 }
 
-InstructionCost
-AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
-                               VectorType *SrcTy, ArrayRef<int> Mask,
-                               TTI::TargetCostKind CostKind, int Index,
-                               VectorType *SubTp, ArrayRef<const Value *> Args,
-                               const Instruction *CxtI) const {
+InstructionCost AArch64TTIImpl::getShuffleCostImpl(
+    TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+    ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+    VectorType *SubTp, ArrayRef<const Value *> Args,
+    const Instruction *CxtI) const {
   assert((Mask.empty() || DstTy->isScalableTy() ||
           Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
          "Expected the Mask to match the return size if given");
@@ -5565,10 +5564,10 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
       // of element moves into a new vector.
       InstructionCost NCost =
           NumSources <= 2
-              ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
-                                               : TTI::SK_PermuteTwoSrc,
-                               NTp, NTp, NMask, CostKind, 0, nullptr, Args,
-                               CxtI)
+              ? getShuffleCostImpl(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
+                                                   : TTI::SK_PermuteTwoSrc,
+                                   NTp, NTp, NMask, CostKind, 0, nullptr, Args,
+                                   CxtI)
               : LTNumElts;
       Result.first->second = NCost;
       Cost += NCost;
@@ -5798,8 +5797,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
   // Restore optimal kind.
   if (IsExtractSubvector)
     Kind = TTI::SK_ExtractSubvector;
-  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
-                               Args, CxtI);
+  return BaseT::getShuffleCostImpl(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                                   SubTp, Args, CxtI);
 }
 
 static bool containsDecreasingPointers(Loop *TheLoop,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 9ada70bd7086a..ea5cdafb8e8c2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -451,10 +451,11 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
-                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
-                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
-                 const Instruction *CxtI = nullptr) const override;
+  getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                     VectorType *SrcTy, ArrayRef<int> Mask,
+                     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+                     ArrayRef<const Value *> Args = {},
+                     const Instruction *CxtI = nullptr) const override;
 
   InstructionCost getScalarizationOverhead(
       VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5e41273556d3d..a9763c5839ad3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1182,16 +1182,15 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
   }
 }
 
-InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                           VectorType *DstTy, VectorType *SrcTy,
-                                           ArrayRef<int> Mask,
-                                           TTI::TargetCostKind CostKind,
-                                           int Index, VectorType *SubTp,
-                                           ArrayRef<const Value *> Args,
-                                           const Instruction *CxtI) const {
+InstructionCost
+GCNTTIImpl::getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                               VectorType *SrcTy, ArrayRef<int> Mask,
+                               TTI::TargetCostKind CostKind, int Index,
+                               VectorType *SubTp, ArrayRef<const Value *> Args,
+                               const Instruction *CxtI) const {
   if (!isa<FixedVectorType>(SrcTy))
-    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
-                                 SubTp);
+    return BaseT::getShuffleCostImpl(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                                     SubTp);
 
   Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
 
@@ -1241,8 +1240,8 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     }
   }
 
-  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
-                               SubTp);
+  return BaseT::getShuffleCostImpl(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                                   SubTp);
 }
 
 /// Whether it is profitable to sink the operands of an
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 64a244e33f18f..cf0366263ee49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -236,10 +236,11 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   InstructionCost getVectorSplitCost() const { return 0; }
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
-                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
-                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
-                 const Instruction *CxtI = nullptr) const override;
+  getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                     VectorType *SrcTy, ArrayRef<int> Mask,
+                     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+                     ArrayRef<const Value *> Args = {},
+                     const Instruction *CxtI = nullptr) const override;
 
   bool isProfitableToSinkOperands(Instruction *I,
                                   SmallVectorImpl<Use *> &Ops) const override;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 203fb76d7be86..333c16ac098ce 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1232,13 +1232,12 @@ InstructionCost ARMTTIImpl::getMemcpyCost(const Instruction *I) const {
   return NumOps;
 }
 
-InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                           VectorType *DstTy, VectorType *SrcTy,
-                                           ArrayRef<int> Mask,
-                                           TTI::TargetCostKind CostKind,
-                                           int Index, VectorType *SubTp,
-                                           ArrayRef<const Value *> Args,
-                                           const Instruction *CxtI) const {
+InstructionCost
+ARMTTIImpl::getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                               VectorType *SrcTy, ArrayRef<int> Mask,
+                               TTI::TargetCostKind CostKind, int Index,
+                               VectorType *SubTp, ArrayRef<const Value *> Args,
+                               const Instruction *CxtI) const {
   assert((Mask.empty() || DstTy->isScalableTy() ||
           Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
          "Expected the Mask to match the return size if given");
@@ -1350,8 +1349,8 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
                      ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)
                      : 1;
-  return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
-                                          Index, SubTp);
+  return BaseCost * BaseT::getShuffleCostImpl(Kind, DstTy, SrcTy, Mask,
+                                              CostKind, Index, SubTp);
 }
 
 InstructionCost ARMTTIImpl::getArithmeticInstrCost(
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index ca06b9e3cb661..f4a147c123cb5 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -223,10 +223,11 @@ class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
   int getNumMemOps(const IntrinsicInst *I) const;
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
-                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
-                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
-                 const Instruction *CxtI = nullptr) const override;
+  getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                     VectorType *SrcTy, ArrayRef<int> Mask,
+                     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+                     ArrayRef<const Value *> Args = {},
+                     const Instruction *CxtI = nullptr) const override;
 
   bool preferInLoopReduction(RecurKind Kind, Type *Ty) const override;
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 9fb7d471fd22a..bb5eaa4685466 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -226,12 +226,11 @@ HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                       CostKind);
 }
 
-InstructionCost
-HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
-                               VectorType *SrcTy, ArrayRef<int> Mask,
-                               TTI::TargetCostKind CostKind, int Index,
-                               VectorType *SubTp, ArrayRef<const Value *> Args,
-                               const Instruction *CxtI) const {
+InstructionCost HexagonTTIImpl::getShuffleCostImpl(
+    TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+    ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+    VectorType *SubTp, ArrayRef<const Value *> Args,
+    const Instruction *CxtI) const {
   return 1;
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index af8dede723083..cb128d9b3f2d4 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,10 +123,11 @@ class HexagonTTIImpl final : public BasicTTIImplBase<HexagonTTIImpl> {
                         unsigned AddressSpace,
                         TTI::TargetCostKind CostKind) const override;
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
-                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
-                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
-                 const Instruction *CxtI = nullptr) const override;
+  getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                     VectorType *SrcTy, ArrayRef<int> Mask,
+                     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+                     ArrayRef<const Value *> Args = {},
+                     const Instruction *CxtI = nullptr) const override;
   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                          const Value *Ptr, bool VariableMask,
                                          Align Alignment,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2fba090f2d501..7818f92ca29ad 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -603,13 +603,12 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost(
   return Cost * CostFactor;
 }
 
-InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                           VectorType *DstTy, VectorType *SrcTy,
-                                           ArrayRef<int> Mask,
-                                           TTI::TargetCostKind CostKind,
-                                           int Index, VectorType *SubTp,
-                                           ArrayRef<const Value *> Args,
-                                           const Instruction *CxtI) const {
+InstructionCost
+PPCTTIImpl::getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                               VectorType *SrcTy, ArrayRef<int> Mask,
+                               TTI::TargetCostKind CostKind, int Index,
+                               VectorType *SubTp, ArrayRef<const Value *> Args,
+                               const Instruction *CxtI) const {
 
   InstructionCost CostFactor =
       vectorCostAdjustmentFactor(Instruction::ShuffleVector, SrcTy, nullptr);
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 475472ac3720f..f33c555bc1351 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -111,10 +111,11 @@ class PPCTTIImpl final : public BasicTTIImplBase<PPCTTIImpl> {
       ArrayRef<const Value *> Args = {},
       const Instruction *CxtI = nullptr) const override;
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
-                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
-                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
-                 const Instruction *CxtI = nullptr) const override;
+  getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                     VectorType *SrcTy, ArrayRef<int> Mask,
+                     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+                     ArrayRef<const Value *> Args = {},
+                     const Instruction *CxtI = nullptr) const override;
   InstructionCost
   getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                    TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 67a51c12b508e..78194208f2b0c 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -452,13 +452,13 @@ static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI,
         if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
                  .second)
           return;
-        Cost += TTI.getShuffleCost(
+        Cost += TTI.getShuffleCostImpl(
             TTI::SK_PermuteSingleSrc,
             FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
             SingleOpTy, RegMask, CostKind, 0, nullptr);
       },
       [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
-        Cost += TTI.getShuffleCost(
+        Cost += TTI.getShuffleCostImpl(
             TTI::SK_PermuteTwoSrc,
             FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
             SingleOpTy, RegMask, CostKind, 0, nullptr);
@@ -529,11 +529,13 @@ costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT,
                  .second)
           return;
         ++NumShuffles;
-        Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
+        Cost +=
+            TTI.getShuffleCostImpl(TTI::SK_PermuteSingleSrc, SingleOpTy,
                                    SingleOpTy, RegMask, CostKind, 0, nullptr);
       },
       [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
-        Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+        Cost +=
+            TTI.getShuffleCostImpl(TTI::SK_PermuteTwoSrc, SingleOpTy,
                                    SingleOpTy, RegMask, CostKind, 0, nullptr);
         NumShuffles += 2;
       });
@@ -605,12 +607,11 @@ InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
   return FirstSlideCost + SecondSlideCost + MaskCost;
 }
 
-InstructionCost
-RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
-                             VectorType *SrcTy, ArrayRef<int> Mask,
-                             TTI::TargetCostKind CostKind, int Index,
-                             VectorType *SubTp, ArrayRef<const Value *> Args,
-                             const Instruction *CxtI) const {
+InstructionCost RISCVTTIImpl::getShuffleCostImpl(
+    TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+    ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+    VectorType *SubTp, ArrayRef<const Value *> Args,
+    const Instruction *CxtI) const {
   assert((Mask.empty() || DstTy->isScalableTy() ||
           Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
          "Expected the Mask to match the return size if given");
@@ -675,8 +676,8 @@ RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
             // destination vector register group for vslideup cannot overlap the
             // source.
             Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
-            Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
-                                   CostKind, InsertIndex, SubTp);
+            Cost += getShuffleCostImpl(TTI::SK_InsertSubvector, DestTp, DestTp,
+                                       {}, CostKind, InsertIndex, SubTp);
           }
           return Cost;
         }
@@ -864,8 +865,8 @@ RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
                           cast<VectorType>(SrcTy)->getElementCount());
       return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
                               TTI::CastContextHint::None, CostKind) +
-             getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
-                            nullptr) +
+             getShuffleCostImpl(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind,
+                                0, nullptr) +
              getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
                               TTI::CastContextHint::None, CostKind);
     }
@@ -912,8 +913,8 @@ RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
     return FixedCost + LT.first * (GatherCost + SlideCost);
   }
   }
-  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
-                               SubTp);
+  return BaseT::getShuffleCostImpl(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                                   SubTp);
 }
 
 static unsigned isM1OrSmaller(MVT VT) {
@@ -1039,8 +1040,8 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
       auto Mask = createStrideMask(Index, Factor, VF);
       Mask.resize(VF * Factor, -1);
       InstructionCost ShuffleCost =
-          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, VecTy, VecTy,
-                         Mask, CostKind, 0, nullptr, {});
+          getShuffleCostImpl(TTI::ShuffleKind::SK_PermuteSingleSrc, VecTy,
+                             VecTy, Mask, CostKind, 0, nullptr, {});
       Cost += ShuffleCost;
     }
     return Cost;
@@ -1066,8 +1067,8 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
   // shuffle that goes into the wide store
   auto Mask = createInterleaveMask(VF, Factor);
   InstructionCost ShuffleCost =
-      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, FVTy, Mask,
-                     CostKind, 0, nullptr, {});
+      getShuffleCostImpl(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, FVTy,
+                         Mask, CostKind, 0, nullptr, {});
   return MemCost + ShuffleCost;
 }
 
@@ -1537,9 +1538,10 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     // To support type-based query from vectorizer, set the index to 0.
     // Note that index only change the cost from vslide.vx to vslide.vi and in
     // current implementations they have same costs.
-    return getShuffleCost(TTI::SK_Splice, cast<VectorType>(ICA.getReturnType()),
-                          cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
-                          0, cast<VectorType>(ICA.getReturnType()));
+    return getShuffleCostImpl(
+        TTI::SK_Splice, cast<VectorType>(ICA.getReturnType()),
+        cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind, 0,
+        cast<VectorType>(ICA.getReturnType()));
   }
   }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 12bf8c1b4de70..e5b8f8b8503b1 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -160,10 +160,11 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
   }
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
-                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
-                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
-                 const Instruction *CxtI = nullptr) const override;
+  getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                     VectorType *SrcTy, ArrayRef<int> Mask,
+                     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+                     ArrayRef<const Value *> Args = {},
+                     const Instruction *CxtI = nullptr) const override;
 
   InstructionCost getScalarizationOverhead(
       VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index f32c9bd2bdea1..b45e34a6cdd40 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -738,12 +738,11 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
                                        Args, CxtI);
 }
 
-InstructionCost
-SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
-                               VectorType *SrcTy, ArrayRef<int> Mask,
-                               TTI::TargetCostKind CostKind, int Index,
-                               VectorType *SubTp, ArrayRef<const Value *> Args,
-                               const Instruction *CxtI) const {
+InstructionCost SystemZTTIImpl::getShuffleCostImpl(
+    TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
+    ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
+    VectorType *SubTp, ArrayRef<const Value *> Args,
+    const Instruction *CxtI) const {
   Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
   if (ST->hasVector()) {
     unsigned NumVectors = getNumVectorRegs(SrcTy);
@@ -777,8 +776,8 @@ SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
     }
   }
 
-  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
-                               SubTp);
+  return BaseT::getShuffleCostImpl(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                                   SubTp);
 }
 
 // Return the log2 difference of the element sizes of the two vector types.
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index dc5736e8af009..09552cf833174 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -102,10 +102,11 @@ class SystemZTTIImpl final : public BasicTTIImplBase<SystemZTTIImpl> {
       ArrayRef<const Value *> Args = {},
       const Instruction *CxtI = nullptr) const override;
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
-                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
-                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
-                 const Instruction *CxtI = nullptr) const override;
+  getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                     VectorType *SrcTy, ArrayRef<int> Mask,
+                     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+                     ArrayRef<const Value *> Args = {},
+                     const Instruction *CxtI = nullptr) const override;
   unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy) const;
   unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) const;
   unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 6a05a1700f0cb..3272167926402 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1522,13 +1522,12 @@ X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
   return InstructionCost::getInvalid();
 }
 
-InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
-                                           VectorType *DstTy, VectorType *SrcTy,
-                                           ArrayRef<int> Mask,
-                                           TTI::TargetCostKind CostKind,
-                                           int Index, VectorType *SubTp,
-                                           ArrayRef<const Value *> Args,
-                                           const Instruction *CxtI) const {
+InstructionCost
+X86TTIImpl::getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                               VectorType *SrcTy, ArrayRef<int> Mask,
+                               TTI::TargetCostKind CostKind, int Index,
+                               VectorType *SubTp, ArrayRef<const Value *> Args,
+                               const Instruction *CxtI) const {
   assert((Mask.empty() || DstTy->isScalableTy() ||
           Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
          "Expected the Mask to match the return size if given");
@@ -1550,10 +1549,10 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   if (Kind == TTI::SK_PermuteTwoSrc &&
       Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) &&
       ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
-    return getShuffleCost(TTI::SK_InsertSubvector,
-                          VectorType::getDoubleElementsVectorType(SrcTy),
-                          VectorType::getDoubleElementsVectorType(SrcTy), Mask,
-                          CostKind, Mask.size() / 2, SrcTy);
+    return getShuffleCostImpl(TTI::SK_InsertSubvector,
+                              VectorType::getDoubleElementsVectorType(SrcTy),
+                              VectorType::getDoubleElementsVectorType(SrcTy),
+                              Mask, CostKind, Mask.size() / 2, SrcTy);
 
   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
   if (Kind == TTI::SK_Transpose)
@@ -1633,8 +1632,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                            SubLT.second.getVectorNumElements());
         int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
         InstructionCost ExtractCost =
-            getShuffleCost(TTI::SK_ExtractSubvector, VecTy, VecTy, {}, CostKind,
-                           ExtractIndex, SubTy);
+            getShuffleCostImpl(TTI::SK_ExtractSubvector, VecTy, VecTy, {},
+                               CostKind, ExtractIndex, SubTy);
 
         // If the original size is 32-bits or more, we can use pshufd. Otherwise
         // if we have SSSE3 we can use pshufb.
@@ -1779,9 +1778,9 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                 // one.
                 if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
                     PrevRegMask != RegMask)
-                  Cost +=
-                      getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
-                                     SingleOpTy, RegMask, CostKind, 0, nullptr);
+                  Cost += getShuffleCostImpl(TTI::SK_PermuteSingleSrc,
+                                             SingleOpTy, SingleOpTy, RegMask,
+                                             CostKind, 0, nullptr);
                 else
                   // Just a copy of previous destination register.
                   Cost += TTI::TCC_Basic;
@@ -1798,20 +1797,21 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
             [this, SingleOpTy, CostKind,
              &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/,
                     unsigned /*Unused*/, bool /*Unused*/) {
-              Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+              Cost +=
+                  getShuffleCostImpl(TTI::SK_PermuteTwoSrc, SingleOpTy,
                                      SingleOpTy, RegMask, CostKind, 0, nullptr);
             });
         return Cost;
       }
 
       InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
-      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
-                                            SingleOpTy, {}, CostKind, 0,
-                                            nullptr);
+      return NumOfShuffles * getShuffleCostImpl(TTI::SK_PermuteTwoSrc,
+                                                SingleOpTy, SingleOpTy, {},
+                                                CostKind, 0, nullptr);
     }
 
-    return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
-                                 SubTp);
+    return BaseT::getShuffleCostImpl(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                                     SubTp);
   }
 
   // If we're just moving a single element around (probably as an alternative to
@@ -2278,8 +2278,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return LT.first * *KindCost;
   }
 
-  return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
-                               SubTp);
+  return BaseT::getShuffleCostImpl(Kind, DstTy, SrcTy, Mask, CostKind, Index,
+                                   SubTp);
 }
 
 InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
@@ -4919,8 +4919,8 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
       EVT VT = TLI->getValueType(DL, Val);
       if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
         SubTy = FixedVectorType::get(ScalarType, SubNumElts);
-      ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
-                                   CostKind, 0, SubTy);
+      ShuffleCost = getShuffleCostImpl(TTI::SK_PermuteTwoSrc, SubTy, SubTy, {},
+                                       CostKind, 0, SubTy);
     }
     int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
     return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
@@ -5015,8 +5015,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(
           // FIXME: we don't need to extract if all non-demanded elements
           //        are legalization-inserted padding.
           if (!LaneEltMask.isAllOnes())
-            Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
-                                   CostKind, I * NumEltsPerLane, LaneTy);
+            Cost += getShuffleCostImpl(TTI::SK_ExtractSubvector, Ty, Ty, {},
+                                       CostKind, I * NumEltsPerLane, LaneTy);
           Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
                                                   /*Extract*/ false, CostKind);
         }
@@ -5033,8 +5033,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(
             if (!AffectedLanes[I] ||
                 (Lane == 0 && FullyAffectedLegalVectors[LegalVec]))
               continue;
-            Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, Ty, {},
-                                   CostKind, I * NumEltsPerLane, LaneTy);
+            Cost += getShuffleCostImpl(TTI::SK_InsertSubvector, Ty, Ty, {},
+                                       CostKind, I * NumEltsPerLane, LaneTy);
           }
         }
       }
@@ -5093,8 +5093,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(
               NumEltsPerLane, I * NumEltsPerLane);
           if (LaneEltMask.isZero())
             continue;
-          Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {}, CostKind,
-                                 I * NumEltsPerLane, LaneTy);
+          Cost += getShuffleCostImpl(TTI::SK_ExtractSubvector, Ty, Ty, {},
+                                     CostKind, I * NumEltsPerLane, LaneTy);
           Cost += BaseT::getScalarizationOverhead(
               LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
         }
@@ -5211,10 +5211,10 @@ X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
       DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors);
   unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();
 
-  InstructionCost SingleShuffleCost =
-      getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
-                     /*Mask=*/{}, CostKind,
-                     /*Index=*/0, /*SubTp=*/nullptr);
+  InstructionCost SingleShuffleCost = getShuffleCostImpl(
+      TTI::SK_PermuteSingleSrc, SingleDstVecTy, SingleDstVecTy,
+      /*Mask=*/{}, CostKind,
+      /*Index=*/0, /*SubTp=*/nullptr);
   return NumDstVectorsDemanded * SingleShuffleCost;
 }
 
@@ -5355,10 +5355,10 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
         SubVecEltsLeft += CurrVecTy->getNumElements();
         // And that's free only for the 0'th subvector of a legalized vector.
         if (!Is0thSubVec)
-          Cost +=
-              getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
-                                    : TTI::ShuffleKind::SK_ExtractSubvector,
-                             VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
+          Cost += getShuffleCostImpl(
+              IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
+                     : TTI::ShuffleKind::SK_ExtractSubvector,
+              VTy, VTy, {}, CostKind, NumEltDone(), CurrVecTy);
       }
 
       // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
@@ -5434,17 +5434,17 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
   if (VT.isSimple() && Ty != VT.getSimpleVT() &&
       LT.second.getVectorNumElements() == NumElem)
     // Promotion requires extend/truncate for data and a shuffle for mask.
-    Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {}, CostKind,
-                           0, nullptr) +
-            getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {}, CostKind,
-                           0, nullptr);
+    Cost += getShuffleCostImpl(TTI::SK_PermuteTwoSrc, SrcVTy, SrcVTy, {},
+                               CostKind, 0, nullptr) +
+            getShuffleCostImpl(TTI::SK_PermuteTwoSrc, MaskTy, MaskTy, {},
+                               CostKind, 0, nullptr);
 
   else if (LT.first * Ty.getVectorNumElements() > NumElem) {
     auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
                                            Ty.getVectorNumElements());
     // Expanding requires fill mask with zeroes
-    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
-                           CostKind, 0, MaskTy);
+    Cost += getShuffleCostImpl(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy,
+                               {}, CostKind, 0, MaskTy);
   }
 
   // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
@@ -5708,8 +5708,8 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
     // If we're reducing from 256/512 bits, use an extract_subvector.
     if (Size > 128) {
       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
-      ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
-                                      CostKind, NumVecElts, SubTy);
+      ReductionCost += getShuffleCostImpl(TTI::SK_ExtractSubvector, Ty, Ty, {},
+                                          CostKind, NumVecElts, SubTy);
       Ty = SubTy;
     } else if (Size == 128) {
       // Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -5720,8 +5720,8 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
       else
         ShufTy =
             FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
-      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
-                                      {}, CostKind, 0, nullptr);
+      ReductionCost += getShuffleCostImpl(TTI::SK_PermuteSingleSrc, ShufTy,
+                                          ShufTy, {}, CostKind, 0, nullptr);
     } else if (Size == 64) {
       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
       FixedVectorType *ShufTy;
@@ -5731,8 +5731,8 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
       else
         ShufTy =
             FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
-      ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
-                                      {}, CostKind, 0, nullptr);
+      ReductionCost += getShuffleCostImpl(TTI::SK_PermuteSingleSrc, ShufTy,
+                                          ShufTy, {}, CostKind, 0, nullptr);
     } else {
       // Reducing from smaller size is a shift by immediate.
       auto *ShiftTy = FixedVectorType::get(
@@ -5890,8 +5890,8 @@ X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
     // If we're reducing from 256/512 bits, use an extract_subvector.
     if (Size > 128) {
       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
-      MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, Ty, {},
-                                   CostKind, NumVecElts, SubTy);
+      MinMaxCost += getShuffleCostImpl(TTI::SK_ExtractSubvector, Ty, Ty, {},
+                                       CostKind, NumVecElts, SubTy);
       Ty = SubTy;
     } else if (Size == 128) {
       // Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -5901,8 +5901,8 @@ X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
             FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
       else
         ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
-      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
-                                   CostKind, 0, nullptr);
+      MinMaxCost += getShuffleCostImpl(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
+                                       {}, CostKind, 0, nullptr);
     } else if (Size == 64) {
       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
       FixedVectorType *ShufTy;
@@ -5910,8 +5910,8 @@ X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy,
         ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
       else
         ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
-      MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy, {},
-                                   CostKind, 0, nullptr);
+      MinMaxCost += getShuffleCostImpl(TTI::SK_PermuteSingleSrc, ShufTy, ShufTy,
+                                       {}, CostKind, 0, nullptr);
     } else {
       // Reducing from smaller size is a shift by immediate.
       auto *ShiftTy = FixedVectorType::get(
@@ -6696,7 +6696,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
     TTI::ShuffleKind ShuffleKind =
         (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
 
-    InstructionCost ShuffleCost = getShuffleCost(
+    InstructionCost ShuffleCost = getShuffleCostImpl(
         ShuffleKind, SingleMemOpTy, SingleMemOpTy, {}, CostKind, 0, nullptr);
 
     unsigned NumOfLoadsInInterleaveGrp =
@@ -6754,8 +6754,8 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
   // shuffle.
   unsigned NumOfSources = Factor; // The number of values to be merged.
   InstructionCost ShuffleCost =
-      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy, {},
-                     CostKind, 0, nullptr);
+      getShuffleCostImpl(TTI::SK_PermuteTwoSrc, SingleMemOpTy, SingleMemOpTy,
+                         {}, CostKind, 0, nullptr);
   unsigned NumOfShufflesPerStore = NumOfSources - 1;
 
   // The SK_MergeTwoSrc shuffle clobbers one of src operands.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index bc06c4746c3c4..ed92d9737fe10 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -149,10 +149,11 @@ class X86TTIImpl final : public BasicTTIImplBase<X86TTIImpl> {
                                   TTI::TargetCostKind CostKind) const override;
 
   InstructionCost
-  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
-                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
-                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
-                 const Instruction *CxtI = nullptr) const override;
+  getShuffleCostImpl(TTI::ShuffleKind Kind, VectorType *DstTy,
+                     VectorType *SrcTy, ArrayRef<int> Mask,
+                     TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
+                     ArrayRef<const Value *> Args = {},
+                     const Instruction *CxtI = nullptr) const override;
   InstructionCost
   getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                    TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
index 5cb2c4530aa57..11f30a7c7feb3 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
@@ -556,35 +556,22 @@ define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double>
 }
 
 define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
-; SSE2-LABEL: @buildvector_mul_subadd_ps256(
-; SSE2-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
-; SSE2-NEXT:    [[TMP0:%.*]] = fadd <8 x float> [[A]], [[B:%.*]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; SSE2-NEXT:    [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; SSE2-NEXT:    ret <8 x float> [[TMP4]]
-;
-; SSE4-LABEL: @buildvector_mul_subadd_ps256(
-; SSE4-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
-; SSE4-NEXT:    [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE4-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; SSE4-NEXT:    ret <8 x float> [[TMP6]]
+; SSE-LABEL: @buildvector_mul_subadd_ps256(
+; SSE-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <8 x float> [[A]], [[B:%.*]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE-NEXT:    [[TMP3:%.*]] = fsub <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; SSE-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX_FMA4-LABEL: @buildvector_mul_subadd_ps256(
 ; AVX_FMA4-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
-; AVX_FMA4-NEXT:    [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
+; AVX_FMA4-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B:%.*]]
+; AVX_FMA4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX_FMA4-NEXT:    [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B]]
 ; AVX_FMA4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX_FMA4-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
-; AVX_FMA4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX_FMA4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; AVX_FMA4-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 ; AVX_FMA4-NEXT:    ret <8 x float> [[TMP6]]
 ;
 ; AVX_FMA3-LABEL: @buildvector_mul_subadd_ps256(
@@ -677,13 +664,11 @@ define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float>
 ;
 ; AVX_FMA-LABEL: @buildvector_mul_subadd_ps512(
 ; AVX_FMA-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
-; AVX_FMA-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B:%.*]]
+; AVX_FMA-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B:%.*]]
+; AVX_FMA-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX_FMA-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B]]
 ; AVX_FMA-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; AVX_FMA-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B]]
-; AVX_FMA-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[TMP4]], <16 x float> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX_FMA-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; AVX_FMA-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP2]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 ; AVX_FMA-NEXT:    ret <16 x float> [[TMP7]]
 ;
 ; AVX512-LABEL: @buildvector_mul_subadd_ps512(
@@ -880,13 +865,11 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double>
 ;
 ; AVX_FMA-LABEL: @buildvector_mul_subadd_pd512(
 ; AVX_FMA-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
-; AVX_FMA-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
+; AVX_FMA-NEXT:    [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B:%.*]]
+; AVX_FMA-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX_FMA-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B]]
 ; AVX_FMA-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX_FMA-NEXT:    [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]]
-; AVX_FMA-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA-NEXT:    [[TMP6:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX_FMA-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; AVX_FMA-NEXT:    [[TMP7:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 ; AVX_FMA-NEXT:    ret <8 x double> [[TMP7]]
 ;
 ; AVX512-LABEL: @buildvector_mul_subadd_pd512(
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 9bfd92ef35a46..f265fed11e356 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -61,27 +61,48 @@ define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @add_v8i16_u1234567(
 ; SSE2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
 ; SSE2-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[A]], [[SHIFT3]]
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HADD1:%.*]] = add <8 x i16> [[TMP7]], [[TMP4]]
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE2-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP5]], [[TMP8]]
+; SSE2-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HADD1:%.*]] = add <8 x i16> [[TMP9]], [[TMP10]]
 ; SSE2-NEXT:    [[HADD3:%.*]] = shufflevector <8 x i16> [[HADD1]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD3]], <8 x i16> [[TMP3]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v8i16_u1234567(
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 5, i32 6, i32 8, i32 10, i32 12, i32 14>
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 4, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <2 x i32> <i32 5, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <2 x i32> <i32 4, i32 7>
+; SSE4-NEXT:    [[TMP3:%.*]] = add <2 x i16> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE4-NEXT:    [[TMP11:%.*]] = add <4 x i16> [[TMP4]], [[TMP8]]
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
-; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
+; SSE4-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> [[TMP9]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP11]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP10]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE4-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v8i16_u1234567(
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 5, i32 6, i32 8, i32 10, i32 12, i32 14>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 4, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <2 x i32> <i32 5, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <2 x i32> <i32 4, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = add <2 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX-NEXT:    [[TMP11:%.*]] = add <4 x i16> [[TMP4]], [[TMP8]]
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    ret <8 x i16> [[TMP7]]
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> [[TMP9]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP11]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP10]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:    ret <8 x i16> [[RESULT]]
 ;
   %a0 = extractelement <8 x i16> %a, i32 0
   %a1 = extractelement <8 x i16> %a, i32 1
@@ -136,9 +157,13 @@ define <8 x i16> @add_v8i16_76u43210(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v8i16_76u43210(
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[HADD22:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <2 x i32> <i32 3, i32 4>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <2 x i32> <i32 2, i32 5>
+; SSE4-NEXT:    [[TMP6:%.*]] = add <2 x i16> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HADD0:%.*]] = add <8 x i16> [[A]], [[TMP7]]
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HADD22:%.*]] = shufflevector <8 x i16> [[HADD0]], <8 x i16> [[TMP8]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B:%.*]], <8 x i32> <i32 6, i32 8, i32 poison, i32 12, i32 14, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 9, i32 poison, i32 13, i32 15, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP3]], [[TMP4]]
@@ -146,9 +171,13 @@ define <8 x i16> @add_v8i16_76u43210(<8 x i16> %a, <8 x i16> %b) {
 ; SSE4-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; AVX-LABEL: @add_v8i16_76u43210(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[HADD22:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <2 x i32> <i32 3, i32 4>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <2 x i32> <i32 2, i32 5>
+; AVX-NEXT:    [[TMP6:%.*]] = add <2 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[HADD0:%.*]] = add <8 x i16> [[A]], [[TMP7]]
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[HADD22:%.*]] = shufflevector <8 x i16> [[HADD0]], <8 x i16> [[TMP8]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B:%.*]], <8 x i32> <i32 6, i32 8, i32 poison, i32 12, i32 14, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 9, i32 poison, i32 13, i32 15, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP3]], [[TMP4]]
@@ -274,38 +303,62 @@ define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: @add_v16i16_0123u56789uBCDEF(
 ; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 14
 ; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <2 x i32> <i32 9, i32 10>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <2 x i32> <i32 8, i32 11>
+; SSE2-NEXT:    [[TMP3:%.*]] = add <2 x i16> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    [[BEF:%.*]] = add i16 [[BE]], [[BF]]
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HADD8:%.*]] = add <16 x i16> [[TMP3]], [[TMP7]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23>
+; SSE2-NEXT:    [[TMP10:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
+; SSE2-NEXT:    [[TMP11:%.*]] = add <4 x i16> [[TMP7]], [[TMP8]]
+; SSE2-NEXT:    [[HADD8:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HADD92:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HADDB:%.*]] = insertelement <16 x i16> [[HADD92]], i16 [[BEF]], i64 11
-; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP10]], [[TMP8]]
+; SSE2-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i16> [[TMP11]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDB]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v16i16_0123u56789uBCDEF(
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 26, i32 poison, i32 30, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 27, i32 poison, i32 31, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 24, i32 26, i32 28, i32 30>
-; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP6]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 25, i32 27, i32 29, i32 31>
+; SSE4-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
+; SSE4-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
+; SSE4-NEXT:    [[A89:%.*]] = add i16 [[A8]], [[A9]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23>
+; SSE4-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
+; SSE4-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HADD8:%.*]] = insertelement <16 x i16> [[TMP10]], i16 [[A89]], i64 8
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 10, i32 poison, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 11, i32 poison, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP7]], [[TMP8]]
-; SSE4-NEXT:    ret <16 x i16> [[TMP9]]
+; SSE4-NEXT:    [[HADDB2:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDB2]], <16 x i16> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE4-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; AVX2-LABEL: @add_v16i16_0123u56789uBCDEF(
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 9, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 8, i32 11, i32 poison, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP1:%.*]], <16 x i16> poison, <2 x i32> <i32 9, i32 10>
+; AVX2-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> poison, <2 x i32> <i32 8, i32 11>
+; AVX2-NEXT:    [[TMP3:%.*]] = add <2 x i16> [[TMP4]], [[TMP13]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 15, i32 poison>
 ; AVX2-NEXT:    [[RESULT:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    ret <16 x i16> [[RESULT]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B:%.*]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
+; AVX2-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP10:%.*]] = add <16 x i16> [[TMP8]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HADD92:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HADDB:%.*]] = shufflevector <16 x i16> [[HADD92]], <16 x i16> [[RESULT]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 30, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <16 x i16> [[HADDB]], <16 x i16> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT:    ret <16 x i16> [[RESULT1]]
 ;
 ; AVX512-LABEL: @add_v16i16_0123u56789uBCDEF(
 ; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 8, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
@@ -383,20 +436,24 @@ define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
 
 define <16 x i16> @add_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: @add_v16i16_FEuCBA98765432u0(
+; SSE2-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
+; SSE2-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
+; SSE2-NEXT:    [[A89:%.*]] = add i16 [[A8]], [[A9]]
 ; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
 ; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
 ; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
 ; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE2-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <4 x i32> <i32 10, i32 12, i32 14, i32 24>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <4 x i32> <i32 11, i32 13, i32 15, i32 25>
+; SSE2-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
 ; SSE2-NEXT:    [[BCD:%.*]] = add i16 [[BC]], [[BD]]
 ; SSE2-NEXT:    [[BEF:%.*]] = add i16 [[BE]], [[BF]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HADD8:%.*]] = add <16 x i16> [[TMP3]], [[TMP7]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 12, i32 14, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 13, i32 15, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HADD8:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[A89]], i64 8
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HADDC1:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HADDE:%.*]] = insertelement <16 x i16> [[HADDC1]], i16 [[BCD]], i64 14
 ; SSE2-NEXT:    [[HADDF:%.*]] = insertelement <16 x i16> [[HADDE]], i16 [[BEF]], i64 15
@@ -404,23 +461,37 @@ define <16 x i16> @add_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v16i16_FEuCBA98765432u0(
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 27, i32 28, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 26, i32 29, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
-; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
+; SSE4-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
+; SSE4-NEXT:    [[A89:%.*]] = add i16 [[A8]], [[A9]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <2 x i32> <i32 11, i32 12>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <2 x i32> <i32 10, i32 13>
+; SSE4-NEXT:    [[TMP3:%.*]] = add <2 x i16> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE4-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HADD8:%.*]] = insertelement <16 x i16> [[TMP10]], i16 [[A89]], i64 8
+; SSE4-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HADDA2:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP11]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP7]], [[TMP8]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32> <i32 3, i32 2, i32 poison, i32 1, i32 0, i32 27, i32 26, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[HADDA2]], <16 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
 ; SSE4-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; AVX2-LABEL: @add_v16i16_FEuCBA98765432u0(
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 9, i32 10, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 8, i32 11, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[HADDA:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <2 x i32> <i32 9, i32 10>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <2 x i32> <i32 8, i32 11>
+; AVX2-NEXT:    [[TMP9:%.*]] = add <2 x i16> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 13, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP10:%.*]] = add <16 x i16> [[A]], [[SHIFT]]
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP11]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HADD92:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> [[TMP8]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HADDA:%.*]] = shufflevector <16 x i16> [[HADD92]], <16 x i16> [[TMP10]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 28, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP5:%.*]] = add <16 x i16> [[TMP3]], [[TMP4]]
@@ -537,11 +608,44 @@ define <4 x i32> @add_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @add_v4i32_u123(
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 4, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+; SSE2-LABEL: @add_v4i32_u123(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 poison, i32 1, i32 4, i32 5>
+; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; SSE4-LABEL: @add_v4i32_u123(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; SSE4-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 poison, i32 1, i32 4, i32 5>
+; SSE4-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX2-LABEL: @add_v4i32_u123(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; AVX2-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 poison, i32 1, i32 4, i32 5>
+; AVX2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX512-LABEL: @add_v4i32_u123(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 5, i32 6>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 4, i32 7>
+; AVX512-NEXT:    [[RESULT1:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -564,11 +668,41 @@ define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @add_v4i32_0u23(
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 4, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+; SSE2-LABEL: @add_v4i32_0u23(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[A]], [[TMP4]]
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; SSE4-LABEL: @add_v4i32_0u23(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; SSE4-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[A]], [[TMP4]]
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; SSE4-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX2-LABEL: @add_v4i32_0u23(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; AVX2-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[A]], [[TMP4]]
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; AVX2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX512-LABEL: @add_v4i32_0u23(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 5, i32 6>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 4, i32 7>
+; AVX512-NEXT:    [[RESULT1:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -598,16 +732,24 @@ define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; SSE4-LABEL: @add_v4i32_01u3(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 poison, i32 6>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 3, i32 poison, i32 7>
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; SSE4-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP1]], [[TMP6]]
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP3:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
 ; SSE4-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
+; SSE4-NEXT:    ret <4 x i32> [[RESULT]]
 ;
 ; AVX2-LABEL: @add_v4i32_01u3(
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 1, i32 2, i32 poison, i32 6>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 3, i32 poison, i32 7>
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; AVX2-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP1]], [[TMP6]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP3:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
 ; AVX2-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
+; AVX2-NEXT:    ret <4 x i32> [[RESULT]]
 ;
 ; AVX512-LABEL: @add_v4i32_01u3(
 ; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
@@ -663,11 +805,32 @@ define <4 x i32> @add_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @add_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @add_v4i32_uu23(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 3>
-; CHECK-NEXT:    [[RESULT1:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; SSE2-LABEL: @add_v4i32_uu23(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; SSE4-LABEL: @add_v4i32_uu23(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; SSE4-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; SSE4-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX2-LABEL: @add_v4i32_uu23(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; AVX2-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; AVX2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX512-LABEL: @add_v4i32_uu23(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 2>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 3>
+; AVX512-NEXT:    [[RESULT1:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -690,11 +853,32 @@ define <4 x i32> @add_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @add_v4i32_01uu(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @add_v4i32_01uu(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+; SSE2-LABEL: @add_v4i32_01uu(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; SSE4-LABEL: @add_v4i32_01uu(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; SSE4-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; AVX2-LABEL: @add_v4i32_01uu(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; AVX2-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; AVX512-LABEL: @add_v4i32_01uu(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -724,16 +908,24 @@ define <4 x i32> @add_v4i32_32u0(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
 ; SSE4-LABEL: @add_v4i32_32u0(
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[RESULT:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; SSE4-NEXT:    ret <4 x i32> [[RESULT]]
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 0>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 3, i32 1>
+; SSE4-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]]
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[RESULT]], <4 x i32> <i32 0, i32 1, i32 poison, i32 4>
+; SSE4-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
 ; AVX2-LABEL: @add_v4i32_32u0(
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[RESULT:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    ret <4 x i32> [[RESULT]]
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 0>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 3, i32 1>
+; AVX2-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[RESULT]], <4 x i32> <i32 0, i32 1, i32 poison, i32 4>
+; AVX2-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
 ; AVX512-LABEL: @add_v4i32_32u0(
 ; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
@@ -809,11 +1001,53 @@ define <8 x i32> @add_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @add_v8i32_01234u67(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
-; CHECK-NEXT:    [[RESULT:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+; SSE2-LABEL: @add_v8i32_01234u67(
+; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE2-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 5, i32 6>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 7>
+; SSE2-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]]
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; SSE2-NEXT:    ret <8 x i32> [[RESULT]]
+;
+; SSE4-LABEL: @add_v8i32_01234u67(
+; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
+; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; SSE4-NEXT:    [[A45:%.*]] = add i32 [[A4]], [[A5]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE4-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 5, i32 6>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 7>
+; SSE4-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HADD4:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A45]], i64 4
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
+;
+; AVX2-LABEL: @add_v8i32_01234u67(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <2 x i32> <i32 5, i32 6>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 7>
+; AVX2-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HADD4:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]]
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; AVX2-NEXT:    ret <8 x i32> [[RESULT]]
+;
+; AVX512-LABEL: @add_v8i32_01234u67(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
+; AVX512-NEXT:    [[RESULT:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <8 x i32> [[RESULT]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -1122,16 +1356,32 @@ define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
 ; SSE2-NEXT:    ret <8 x float> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v8f32_012u4567(
-; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 12, i32 15>
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; SSE4-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP9:%.*]] = fadd <8 x float> [[TMP8]], [[TMP5]]
-; SSE4-NEXT:    ret <8 x float> [[TMP9]]
-;
-; AVX-LABEL: @add_v8f32_012u4567(
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 12, i32 15>
-; AVX-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    ret <8 x float> [[TMP7]]
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x float> [[RESULT]]
+;
+; AVX2-LABEL: @add_v8f32_012u4567(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; AVX2-NEXT:    [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HADD5:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    ret <8 x float> [[RESULT]]
+;
+; AVX512-LABEL: @add_v8f32_012u4567(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 12, i32 15>
+; AVX512-NEXT:    [[RESULT:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <8 x float> [[RESULT]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -1187,16 +1437,41 @@ define <8 x float> @add_v8f32_76u43210(<8 x float> %a, <8 x float> %b) {
 ; SSE2-NEXT:    ret <8 x float> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v8f32_76u43210(
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 5, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 7, i32 4, i32 poison, i32 1, i32 15, i32 13, i32 11, i32 9>
-; SSE4-NEXT:    [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE4-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i64 0
+; SSE4-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i64 1
+; SSE4-NEXT:    [[B01:%.*]] = fadd float [[B0]], [[B1]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; SSE4-NEXT:    [[TMP9:%.*]] = fadd <2 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HADD4:%.*]] = insertelement <8 x float> [[TMP7]], float [[B01]], i64 4
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> [[HADD4]], <8 x i32> <i32 1, i32 0, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
 ; SSE4-NEXT:    ret <8 x float> [[TMP6]]
 ;
-; AVX-LABEL: @add_v8f32_76u43210(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 5, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 7, i32 4, i32 poison, i32 1, i32 15, i32 13, i32 11, i32 9>
-; AVX-NEXT:    [[RESULT:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <8 x float> [[RESULT]]
+; AVX2-LABEL: @add_v8f32_76u43210(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX2-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP4:%.*]] = fadd <8 x float> [[B]], [[SHIFT]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
+; AVX2-NEXT:    [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HADD4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> [[HADD4]], <8 x i32> <i32 1, i32 0, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
+; AVX2-NEXT:    ret <8 x float> [[RESULT]]
+;
+; AVX512-LABEL: @add_v8f32_76u43210(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 5, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 7, i32 4, i32 poison, i32 1, i32 15, i32 13, i32 11, i32 9>
+; AVX512-NEXT:    [[RESULT:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <8 x float> [[RESULT]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -1325,28 +1600,11 @@ define <4 x double> @add_v4f64_0123(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) {
-; SSE2-LABEL: @add_v4f64_u123(
-; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <2 x i32> <i32 0, i32 6>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <2 x i32> <i32 1, i32 7>
-; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE2-NEXT:    [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
-; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
-; SSE2-NEXT:    ret <4 x double> [[RESULT]]
-;
-; SSE4-LABEL: @add_v4f64_u123(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
-; SSE4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT:    ret <4 x double> [[TMP3]]
-;
-; AVX-LABEL: @add_v4f64_u123(
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
-; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    ret <4 x double> [[TMP4]]
+; CHECK-LABEL: @add_v4f64_u123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
+; CHECK-NEXT:    [[RESULT:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[RESULT]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1369,28 +1627,11 @@ define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
-; SSE2-LABEL: @add_v4f64_0u23(
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE2-NEXT:    [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
-; SSE2-NEXT:    ret <4 x double> [[RESULT]]
-;
-; SSE4-LABEL: @add_v4f64_0u23(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 poison, i32 3, i32 7>
-; SSE4-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    ret <4 x double> [[TMP4]]
-;
-; AVX-LABEL: @add_v4f64_0u23(
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 poison, i32 3, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    ret <4 x double> [[TMP4]]
+; CHECK-LABEL: @add_v4f64_0u23(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 poison, i32 3, i32 7>
+; CHECK-NEXT:    [[RESULT:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[RESULT]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1425,9 +1666,14 @@ define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
 ; SSE2-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v4f64_01u3(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
-; SSE4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE4-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[B23:%.*]] = fadd double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
 ; SSE4-NEXT:    ret <4 x double> [[TMP3]]
 ;
 ; AVX-LABEL: @add_v4f64_01u3(
@@ -1469,9 +1715,14 @@ define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) {
 ; SSE2-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v4f64_012u(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
+; SSE4-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE4-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE4-NEXT:    [[A23:%.*]] = fadd double [[A2]], [[A3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE4-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP4]], double [[A23]], i64 2
 ; SSE4-NEXT:    ret <4 x double> [[TMP3]]
 ;
 ; AVX-LABEL: @add_v4f64_012u(
@@ -1509,9 +1760,10 @@ define <4 x double> @add_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
 ; SSE2-NEXT:    ret <4 x double> [[RESULT1]]
 ;
 ; SSE4-LABEL: @add_v4f64_uu23(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
-; SSE4-NEXT:    [[RESULT1:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 2, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE4-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
 ; SSE4-NEXT:    ret <4 x double> [[RESULT1]]
 ;
 ; AVX-LABEL: @add_v4f64_uu23(
@@ -1549,9 +1801,10 @@ define <4 x double> @add_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
 ; SSE2-NEXT:    ret <4 x double> [[TMP4]]
 ;
 ; SSE4-LABEL: @add_v4f64_01uu(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE4-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE4-NEXT:    ret <4 x double> [[TMP3]]
 ;
 ; AVX-LABEL: @add_v4f64_01uu(
@@ -1593,16 +1846,31 @@ define <4 x double> @add_v4f64_32u0(<4 x double> %a, <4 x double> %b) {
 ; SSE2-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v4f64_32u0(
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
-; SSE4-NEXT:    [[RESULT:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SSE4-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SSE4-NEXT:    [[A01:%.*]] = fadd double [[A0]], [[A1]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 2, i32 0>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 3, i32 1>
+; SSE4-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A01]], i64 3
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
-; AVX-LABEL: @add_v4f64_32u0(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
-; AVX-NEXT:    [[RESULT:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX2-LABEL: @add_v4f64_32u0(
+; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP1:%.*]] = fadd <4 x double> [[A]], [[SHIFT]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 2, i32 0>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 3, i32 1>
+; AVX2-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 4>
+; AVX2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX512-LABEL: @add_v4f64_32u0(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
+; AVX512-NEXT:    [[RESULT:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x double> [[RESULT]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index 13b4d7da97c9d..95b57e446564d 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -59,29 +59,51 @@ define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
 
 define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-LABEL: @sub_v8i16_u1234567(
-; SSE2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <2 x i32> <i32 2, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <2 x i32> <i32 3, i32 5>
+; SSE2-NEXT:    [[TMP4:%.*]] = sub <2 x i16> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[SHIFT3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
 ; SSE2-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[A]], [[SHIFT3]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HSUB22:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE2-NEXT:    [[TMP7:%.*]] = sub <4 x i16> [[TMP5]], [[TMP8]]
+; SSE2-NEXT:    [[HSUB22:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <8 x i32> <i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HSUB3:%.*]] = shufflevector <8 x i16> [[HSUB22]], <8 x i16> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 14, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HSUB3]], <8 x i16> [[TMP3]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v8i16_u1234567(
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <2 x i32> <i32 4, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <2 x i32> <i32 5, i32 7>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <2 x i16> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE4-NEXT:    [[TMP11:%.*]] = sub <4 x i16> [[TMP4]], [[TMP8]]
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
-; SSE4-NEXT:    ret <8 x i16> [[TMP7]]
+; SSE4-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HSUB32:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> [[TMP9]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP11]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HSUB32]], <8 x i16> [[TMP10]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE4-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; AVX-LABEL: @sub_v8i16_u1234567(
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <2 x i32> <i32 4, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <2 x i32> <i32 5, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = sub <2 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX-NEXT:    [[TMP11:%.*]] = sub <4 x i16> [[TMP4]], [[TMP8]]
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    ret <8 x i16> [[TMP7]]
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[HSUB32:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> [[TMP9]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP11]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i16> [[HSUB32]], <8 x i16> [[TMP10]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:    ret <8 x i16> [[RESULT]]
 ;
   %a0 = extractelement <8 x i16> %a, i32 0
   %a1 = extractelement <8 x i16> %a, i32 1
@@ -136,9 +158,13 @@ define <8 x i16> @sub_v8i16_76u43210(<8 x i16> %a, <8 x i16> %b) {
 ; SSE2-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v8i16_76u43210(
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[HSUB22:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <2 x i32> <i32 2, i32 4>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <2 x i32> <i32 3, i32 5>
+; SSE4-NEXT:    [[TMP6:%.*]] = sub <2 x i16> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HSUB0:%.*]] = sub <8 x i16> [[A]], [[TMP7]]
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HSUB22:%.*]] = shufflevector <8 x i16> [[HSUB0]], <8 x i16> [[TMP8]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B:%.*]], <8 x i32> <i32 6, i32 8, i32 poison, i32 12, i32 14, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 9, i32 poison, i32 13, i32 15, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP3]], [[TMP4]]
@@ -146,9 +172,13 @@ define <8 x i16> @sub_v8i16_76u43210(<8 x i16> %a, <8 x i16> %b) {
 ; SSE4-NEXT:    ret <8 x i16> [[RESULT]]
 ;
 ; AVX-LABEL: @sub_v8i16_76u43210(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[HSUB22:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <2 x i32> <i32 2, i32 4>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <2 x i32> <i32 3, i32 5>
+; AVX-NEXT:    [[TMP6:%.*]] = sub <2 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[HSUB0:%.*]] = sub <8 x i16> [[A]], [[TMP7]]
+; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[HSUB22:%.*]] = shufflevector <8 x i16> [[HSUB0]], <8 x i16> [[TMP8]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B:%.*]], <8 x i32> <i32 6, i32 8, i32 poison, i32 12, i32 14, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 7, i32 9, i32 poison, i32 13, i32 15, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP3]], [[TMP4]]
@@ -274,38 +304,68 @@ define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: @sub_v16i16_0123u56789uBCDEF(
 ; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 14
 ; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <2 x i32> <i32 8, i32 10>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <2 x i32> <i32 9, i32 11>
+; SSE2-NEXT:    [[TMP3:%.*]] = sub <2 x i16> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    [[BEF:%.*]] = sub i16 [[BE]], [[BF]]
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HSUB8:%.*]] = sub <16 x i16> [[TMP3]], [[TMP7]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23>
+; SSE2-NEXT:    [[TMP10:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
+; SSE2-NEXT:    [[TMP11:%.*]] = sub <4 x i16> [[TMP7]], [[TMP8]]
+; SSE2-NEXT:    [[HSUB8:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HSUB92:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HSUBB:%.*]] = insertelement <16 x i16> [[HSUB92]], i16 [[BEF]], i64 11
-; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP9:%.*]] = sub <16 x i16> [[TMP10]], [[TMP8]]
+; SSE2-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i16> [[TMP11]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBB]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v16i16_0123u56789uBCDEF(
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 26, i32 poison, i32 30, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 27, i32 poison, i32 31, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 24, i32 26, i32 28, i32 30>
-; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP6]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 25, i32 27, i32 29, i32 31>
+; SSE4-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
+; SSE4-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
+; SSE4-NEXT:    [[A89:%.*]] = sub i16 [[A8]], [[A9]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
+; SSE4-NEXT:    [[TMP6:%.*]] = sub <4 x i16> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP10]], i16 [[A89]], i64 8
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 10, i32 poison, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 11, i32 poison, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP9:%.*]] = sub <16 x i16> [[TMP7]], [[TMP8]]
-; SSE4-NEXT:    ret <16 x i16> [[TMP9]]
+; SSE4-NEXT:    [[HSUBB2:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBB2]], <16 x i16> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
+; SSE4-NEXT:    ret <16 x i16> [[RESULT]]
 ;
-; AVX-LABEL: @sub_v16i16_0123u56789uBCDEF(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 8, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 9, i32 11, i32 poison, i32 15, i32 25, i32 27, i32 29, i32 31>
-; AVX-NEXT:    [[RESULT:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <16 x i16> [[RESULT]]
+; AVX2-LABEL: @sub_v16i16_0123u56789uBCDEF(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <2 x i32> <i32 8, i32 10>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <2 x i32> <i32 9, i32 11>
+; AVX2-NEXT:    [[TMP3:%.*]] = sub <2 x i16> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 15, i32 poison>
+; AVX2-NEXT:    [[TMP4:%.*]] = sub <16 x i16> [[A]], [[SHIFT]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B:%.*]], <16 x i16> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <4 x i32> <i32 9, i32 11, i32 13, i32 15>
+; AVX2-NEXT:    [[TMP7:%.*]] = sub <4 x i16> [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP10:%.*]] = sub <16 x i16> [[TMP8]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HSUB92:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HSUBB:%.*]] = shufflevector <16 x i16> [[HSUB92]], <16 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 30, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i16> [[TMP7]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBB]], <16 x i16> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT:    ret <16 x i16> [[RESULT]]
+;
+; AVX512-LABEL: @sub_v16i16_0123u56789uBCDEF(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 8, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 9, i32 11, i32 poison, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX512-NEXT:    [[RESULT:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <16 x i16> [[RESULT]]
 ;
   %a0 = extractelement <16 x i16> %a, i32 0
   %a1 = extractelement <16 x i16> %a, i32 1
@@ -377,20 +437,24 @@ define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
 
 define <16 x i16> @sub_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: @sub_v16i16_FEuCBA98765432u0(
+; SSE2-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
+; SSE2-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
+; SSE2-NEXT:    [[A89:%.*]] = sub i16 [[A8]], [[A9]]
 ; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
 ; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
 ; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
 ; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE2-NEXT:    [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <4 x i32> <i32 10, i32 12, i32 14, i32 24>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <4 x i32> <i32 11, i32 13, i32 15, i32 25>
+; SSE2-NEXT:    [[TMP8:%.*]] = sub <4 x i16> [[TMP4]], [[TMP5]]
 ; SSE2-NEXT:    [[BCD:%.*]] = sub i16 [[BC]], [[BD]]
 ; SSE2-NEXT:    [[BEF:%.*]] = sub i16 [[BE]], [[BF]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HSUB8:%.*]] = sub <16 x i16> [[TMP3]], [[TMP7]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 12, i32 14, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 13, i32 15, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[A89]], i64 8
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HSUBC1:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HSUBE:%.*]] = insertelement <16 x i16> [[HSUBC1]], i16 [[BCD]], i64 14
 ; SSE2-NEXT:    [[HSUBF:%.*]] = insertelement <16 x i16> [[HSUBE]], i16 [[BEF]], i64 15
@@ -398,23 +462,37 @@ define <16 x i16> @sub_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v16i16_FEuCBA98765432u0(
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 26, i32 28, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 27, i32 29, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
-; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i64 8
+; SSE4-NEXT:    [[A9:%.*]] = extractelement <16 x i16> [[A]], i64 9
+; SSE4-NEXT:    [[A89:%.*]] = sub i16 [[A8]], [[A9]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <2 x i32> <i32 10, i32 12>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <2 x i32> <i32 11, i32 13>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <2 x i16> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE4-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HSUB8:%.*]] = insertelement <16 x i16> [[TMP10]], i16 [[A89]], i64 8
+; SSE4-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HSUBA2:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP11]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP9:%.*]] = sub <16 x i16> [[TMP7]], [[TMP8]]
-; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32> <i32 3, i32 2, i32 poison, i32 1, i32 0, i32 27, i32 26, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[HSUBA2]], <16 x i32> <i32 4, i32 3, i32 poison, i32 1, i32 0, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 poison, i32 16>
 ; SSE4-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; AVX2-LABEL: @sub_v16i16_FEuCBA98765432u0(
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[HSUBA:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> poison, <2 x i32> <i32 8, i32 10>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <2 x i32> <i32 9, i32 11>
+; AVX2-NEXT:    [[TMP9:%.*]] = sub <2 x i16> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 13, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP10:%.*]] = sub <16 x i16> [[A]], [[SHIFT]]
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP7:%.*]] = sub <16 x i16> [[TMP11]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HSUB92:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> [[TMP8]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HSUBA:%.*]] = shufflevector <16 x i16> [[HSUB92]], <16 x i16> [[TMP10]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 28, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 poison, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 15, i32 25, i32 poison, i32 29, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP5:%.*]] = sub <16 x i16> [[TMP3]], [[TMP4]]
@@ -531,11 +609,44 @@ define <4 x i32> @sub_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @sub_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @sub_v4i32_u123(
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+; SSE2-LABEL: @sub_v4i32_u123(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 poison, i32 1, i32 4, i32 5>
+; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; SSE4-LABEL: @sub_v4i32_u123(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 poison, i32 1, i32 4, i32 5>
+; SSE4-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX2-LABEL: @sub_v4i32_u123(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; AVX2-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 poison, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 poison, i32 1, i32 4, i32 5>
+; AVX2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX512-LABEL: @sub_v4i32_u123(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; AVX512-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -558,11 +669,41 @@ define <4 x i32> @sub_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @sub_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @sub_v4i32_0u23(
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+; SSE2-LABEL: @sub_v4i32_0u23(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[A]], [[TMP4]]
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; SSE4-LABEL: @sub_v4i32_0u23(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[A]], [[TMP4]]
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; SSE4-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX2-LABEL: @sub_v4i32_0u23(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; AVX2-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[A]], [[TMP4]]
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; AVX2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX512-LABEL: @sub_v4i32_0u23(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
+; AVX512-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -592,16 +733,24 @@ define <4 x i32> @sub_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
 ;
 ; SSE4-LABEL: @sub_v4i32_01u3(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; SSE4-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP1]], [[TMP6]]
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP3:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
 ; SSE4-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
+; SSE4-NEXT:    ret <4 x i32> [[RESULT]]
 ;
 ; AVX2-LABEL: @sub_v4i32_01u3(
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; AVX2-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP1]], [[TMP6]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP3:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
 ; AVX2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
-; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
+; AVX2-NEXT:    ret <4 x i32> [[RESULT]]
 ;
 ; AVX512-LABEL: @sub_v4i32_01u3(
 ; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
@@ -657,11 +806,32 @@ define <4 x i32> @sub_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @sub_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @sub_v4i32_uu23(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
-; CHECK-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i32> [[RESULT1]]
+; SSE2-LABEL: @sub_v4i32_uu23(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; SSE4-LABEL: @sub_v4i32_uu23(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; SSE4-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX2-LABEL: @sub_v4i32_uu23(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; AVX2-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; AVX2-NEXT:    ret <4 x i32> [[RESULT1]]
+;
+; AVX512-LABEL: @sub_v4i32_uu23(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; AVX512-NEXT:    [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -684,11 +854,32 @@ define <4 x i32> @sub_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @sub_v4i32_01uu(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @sub_v4i32_01uu(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+; SSE2-LABEL: @sub_v4i32_01uu(
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4i32_01uu(
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; AVX2-LABEL: @sub_v4i32_01uu(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+; AVX2-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    ret <4 x i32> [[TMP4]]
+;
+; AVX512-LABEL: @sub_v4i32_01uu(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x i32> [[TMP3]]
 ;
   %a0 = extractelement <4 x i32> %a, i32 0
   %a1 = extractelement <4 x i32> %a, i32 1
@@ -718,16 +909,24 @@ define <4 x i32> @sub_v4i32_32u0(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
 ; SSE4-LABEL: @sub_v4i32_32u0(
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[RESULT:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; SSE4-NEXT:    ret <4 x i32> [[RESULT]]
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 0>
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 3, i32 1>
+; SSE4-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP6]], [[TMP3]]
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[RESULT]], <4 x i32> <i32 0, i32 1, i32 poison, i32 4>
+; SSE4-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
 ; AVX2-LABEL: @sub_v4i32_32u0(
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[RESULT:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    ret <4 x i32> [[RESULT]]
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 0>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 3, i32 1>
+; AVX2-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP6]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[RESULT]], <4 x i32> <i32 0, i32 1, i32 poison, i32 4>
+; AVX2-NEXT:    ret <4 x i32> [[RESULT1]]
 ;
 ; AVX512-LABEL: @sub_v4i32_32u0(
 ; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
@@ -803,11 +1002,53 @@ define <8 x i32> @sub_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: @sub_v8i32_01234u67(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
-; CHECK-NEXT:    [[RESULT:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+; SSE2-LABEL: @sub_v8i32_01234u67(
+; SSE2-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP1:%.*]] = sub <8 x i32> [[A]], [[SHIFT]]
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE2-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 6>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 5, i32 7>
+; SSE2-NEXT:    [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HSUB4:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; SSE2-NEXT:    ret <8 x i32> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v8i32_01234u67(
+; SSE4-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
+; SSE4-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; SSE4-NEXT:    [[A45:%.*]] = sub i32 [[A4]], [[A5]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE4-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 6>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 5, i32 7>
+; SSE4-NEXT:    [[TMP6:%.*]] = sub <2 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A45]], i64 4
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x i32> [[RESULT]]
+;
+; AVX2-LABEL: @sub_v8i32_01234u67(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <2 x i32> <i32 4, i32 6>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 5, i32 7>
+; AVX2-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HSUB4:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]]
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; AVX2-NEXT:    ret <8 x i32> [[RESULT]]
+;
+; AVX512-LABEL: @sub_v8i32_01234u67(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; AVX512-NEXT:    [[RESULT:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <8 x i32> [[RESULT]]
 ;
   %a0 = extractelement <8 x i32> %a, i32 0
   %a1 = extractelement <8 x i32> %a, i32 1
@@ -1116,16 +1357,32 @@ define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
 ; SSE2-NEXT:    ret <8 x float> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v8f32_012u4567(
-; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 poison, i32 poison>
 ; SSE4-NEXT:    [[TMP9:%.*]] = fsub <8 x float> [[TMP8]], [[TMP5]]
-; SSE4-NEXT:    ret <8 x float> [[TMP9]]
-;
-; AVX-LABEL: @sub_v8f32_012u4567(
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
-; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
-; AVX-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
-; AVX-NEXT:    ret <8 x float> [[TMP7]]
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE4-NEXT:    ret <8 x float> [[RESULT]]
+;
+; AVX2-LABEL: @sub_v8f32_012u4567(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; AVX2-NEXT:    [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HSUB5:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    ret <8 x float> [[RESULT]]
+;
+; AVX512-LABEL: @sub_v8f32_012u4567(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
+; AVX512-NEXT:    [[RESULT:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <8 x float> [[RESULT]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -1181,16 +1438,41 @@ define <8 x float> @sub_v8f32_76u43210(<8 x float> %a, <8 x float> %b) {
 ; SSE2-NEXT:    ret <8 x float> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v8f32_76u43210(
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 4, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 7, i32 5, i32 poison, i32 1, i32 15, i32 13, i32 11, i32 9>
-; SSE4-NEXT:    [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i64 0
+; SSE4-NEXT:    [[B1:%.*]] = extractelement <8 x float> [[B]], i64 1
+; SSE4-NEXT:    [[B01:%.*]] = fsub float [[B0]], [[B1]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; SSE4-NEXT:    [[TMP9:%.*]] = fsub <2 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[HSUB4:%.*]] = insertelement <8 x float> [[TMP7]], float [[B01]], i64 4
+; SSE4-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> [[HSUB4]], <8 x i32> <i32 1, i32 0, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
 ; SSE4-NEXT:    ret <8 x float> [[TMP6]]
 ;
-; AVX-LABEL: @sub_v8f32_76u43210(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 4, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 7, i32 5, i32 poison, i32 1, i32 15, i32 13, i32 11, i32 9>
-; AVX-NEXT:    [[RESULT:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <8 x float> [[RESULT]]
+; AVX2-LABEL: @sub_v8f32_76u43210(
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX2-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP4:%.*]] = fsub <8 x float> [[B]], [[SHIFT]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; AVX2-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[HSUB4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> [[HSUB4]], <8 x i32> <i32 1, i32 0, i32 poison, i32 12, i32 11, i32 10, i32 9, i32 8>
+; AVX2-NEXT:    ret <8 x float> [[RESULT]]
+;
+; AVX512-LABEL: @sub_v8f32_76u43210(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> [[A:%.*]], <8 x i32> <i32 6, i32 4, i32 poison, i32 0, i32 14, i32 12, i32 10, i32 8>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> [[A]], <8 x i32> <i32 7, i32 5, i32 poison, i32 1, i32 15, i32 13, i32 11, i32 9>
+; AVX512-NEXT:    [[RESULT:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <8 x float> [[RESULT]]
 ;
   %a0 = extractelement <8 x float> %a, i32 0
   %a1 = extractelement <8 x float> %a, i32 1
@@ -1319,28 +1601,11 @@ define <4 x double> @sub_v4f64_0123(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) {
-; SSE2-LABEL: @sub_v4f64_u123(
-; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <2 x i32> <i32 0, i32 6>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <2 x i32> <i32 1, i32 7>
-; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
-; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
-; SSE2-NEXT:    ret <4 x double> [[RESULT]]
-;
-; SSE4-LABEL: @sub_v4f64_u123(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
-; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT:    ret <4 x double> [[TMP3]]
-;
-; AVX-LABEL: @sub_v4f64_u123(
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
-; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    ret <4 x double> [[TMP4]]
+; CHECK-LABEL: @sub_v4f64_u123(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
+; CHECK-NEXT:    [[RESULT:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[RESULT]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1363,28 +1628,11 @@ define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
-; SSE2-LABEL: @sub_v4f64_0u23(
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; SSE2-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
-; SSE2-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE2-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE2-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; SSE2-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
-; SSE2-NEXT:    ret <4 x double> [[RESULT]]
-;
-; SSE4-LABEL: @sub_v4f64_0u23(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
-; SSE4-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
-; SSE4-NEXT:    ret <4 x double> [[TMP4]]
-;
-; AVX-LABEL: @sub_v4f64_0u23(
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
-; AVX-NEXT:    ret <4 x double> [[TMP4]]
+; CHECK-LABEL: @sub_v4f64_0u23(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
+; CHECK-NEXT:    [[RESULT:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret <4 x double> [[RESULT]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
@@ -1419,9 +1667,14 @@ define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
 ; SSE2-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v4f64_01u3(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
-; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
+; SSE4-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE4-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
 ; SSE4-NEXT:    ret <4 x double> [[TMP3]]
 ;
 ; AVX-LABEL: @sub_v4f64_01u3(
@@ -1463,9 +1716,14 @@ define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) {
 ; SSE2-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v4f64_012u(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
+; SSE4-NEXT:    [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE4-NEXT:    [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE4-NEXT:    [[A23:%.*]] = fsub double [[A2]], [[A3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE4-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP4]], double [[A23]], i64 2
 ; SSE4-NEXT:    ret <4 x double> [[TMP3]]
 ;
 ; AVX-LABEL: @sub_v4f64_012u(
@@ -1503,9 +1761,10 @@ define <4 x double> @sub_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
 ; SSE2-NEXT:    ret <4 x double> [[RESULT1]]
 ;
 ; SSE4-LABEL: @sub_v4f64_uu23(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
-; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
-; SSE4-NEXT:    [[RESULT1:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 2, i32 6>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
 ; SSE4-NEXT:    ret <4 x double> [[RESULT1]]
 ;
 ; AVX-LABEL: @sub_v4f64_uu23(
@@ -1543,9 +1802,10 @@ define <4 x double> @sub_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
 ; SSE2-NEXT:    ret <4 x double> [[TMP4]]
 ;
 ; SSE4-LABEL: @sub_v4f64_01uu(
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE4-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE4-NEXT:    ret <4 x double> [[TMP3]]
 ;
 ; AVX-LABEL: @sub_v4f64_01uu(
@@ -1587,16 +1847,31 @@ define <4 x double> @sub_v4f64_32u0(<4 x double> %a, <4 x double> %b) {
 ; SSE2-NEXT:    ret <4 x double> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v4f64_32u0(
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
-; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
-; SSE4-NEXT:    [[RESULT:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
+; SSE4-NEXT:    [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
+; SSE4-NEXT:    [[A01:%.*]] = fsub double [[A0]], [[A1]]
+; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 2, i32 0>
+; SSE4-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 3, i32 1>
+; SSE4-NEXT:    [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE4-NEXT:    [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A01]], i64 3
 ; SSE4-NEXT:    ret <4 x double> [[RESULT]]
 ;
-; AVX-LABEL: @sub_v4f64_32u0(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
-; AVX-NEXT:    [[RESULT:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    ret <4 x double> [[RESULT]]
+; AVX2-LABEL: @sub_v4f64_32u0(
+; AVX2-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP1:%.*]] = fsub <4 x double> [[A]], [[SHIFT]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 2, i32 0>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 3, i32 1>
+; AVX2-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP2]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; AVX2-NEXT:    [[RESULT:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 4>
+; AVX2-NEXT:    ret <4 x double> [[RESULT]]
+;
+; AVX512-LABEL: @sub_v4f64_32u0(
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 poison, i32 4>
+; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 3, i32 1, i32 poison, i32 5>
+; AVX512-NEXT:    [[RESULT:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    ret <4 x double> [[RESULT]]
 ;
   %a0 = extractelement <4 x double> %a, i32 0
   %a1 = extractelement <4 x double> %a, i32 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
index 842cf42505673..d4d6b5e36be83 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
@@ -10,14 +10,9 @@
 
 define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
 ; SSE2-LABEL: @PR50392(
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; SSE2-NEXT:    [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SSE2-NEXT:    [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE2-NEXT:    [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]]
-; SSE2-NEXT:    [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP4]], double [[ADD12]], i64 3
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
+; SSE2-NEXT:    [[SHUFFLE:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    ret <4 x double> [[SHUFFLE]]
 ;
 ; SSE4-LABEL: @PR50392(
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll
index 354a988416c3e..2c25f06ec6fea 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll
@@ -10,10 +10,9 @@
 
 define <4 x double> @PR94546(<4 x double> %a, <4 x double> %b) {
 ; SSE2-LABEL: @PR94546(
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 6>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 7>
-; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 1>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 poison, i32 6>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 poison, i32 7>
+; SSE2-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    ret <4 x double> [[TMP4]]
 ;
 ; SSE4-LABEL: @PR94546(
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity-concat.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity-concat.ll
index 7aba1bbb1c9a0..79be1d2418841 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity-concat.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity-concat.ll
@@ -136,28 +136,22 @@ define <16 x i32> @concata_addmul_bigger(<4 x i32> %a1a, <4 x i32> %a2a, <4 x i3
 
 define <16 x i32> @concata_addmul_bigger_undef(<4 x i32> %a1a, <4 x i32> %a2a, <4 x i32> %a3a, <4 x i32> %a4a, <16 x i32> %b, <16 x i32> %c) {
 ; CHECK-LABEL: @concata_addmul_bigger_undef(
-; CHECK-NEXT:    [[A1:%.*]] = shufflevector <4 x i32> [[A1A:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[A2:%.*]] = shufflevector <4 x i32> [[A2A:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[A3:%.*]] = shufflevector <4 x i32> [[A3A:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[A4:%.*]] = shufflevector <4 x i32> [[A4A:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[B1:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> poison, <4 x i32> <i32 15, i32 14, i32 13, i32 12>
 ; CHECK-NEXT:    [[B2:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <4 x i32> <i32 11, i32 10, i32 9, i32 8>
-; CHECK-NEXT:    [[B3:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[B4:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[C1:%.*]] = shufflevector <16 x i32> [[C:%.*]], <16 x i32> poison, <4 x i32> <i32 15, i32 14, i32 13, i32 12>
 ; CHECK-NEXT:    [[C2:%.*]] = shufflevector <16 x i32> [[C]], <16 x i32> poison, <4 x i32> <i32 11, i32 10, i32 9, i32 8>
-; CHECK-NEXT:    [[C3:%.*]] = shufflevector <16 x i32> [[C]], <16 x i32> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[C4:%.*]] = shufflevector <16 x i32> [[C]], <16 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[X1:%.*]] = mul <4 x i32> [[A1]], [[B1]]
-; CHECK-NEXT:    [[X2:%.*]] = mul <4 x i32> [[A2]], [[B2]]
-; CHECK-NEXT:    [[X3:%.*]] = mul <4 x i32> [[A3]], [[B3]]
-; CHECK-NEXT:    [[X4:%.*]] = mul <4 x i32> [[A4]], [[B4]]
+; CHECK-NEXT:    [[X1:%.*]] = mul <4 x i32> [[A3]], [[B1]]
+; CHECK-NEXT:    [[X2:%.*]] = mul <4 x i32> [[A4]], [[B2]]
 ; CHECK-NEXT:    [[Y1:%.*]] = add <4 x i32> [[X1]], [[C1]]
 ; CHECK-NEXT:    [[Y2:%.*]] = add <4 x i32> [[X2]], [[C2]]
-; CHECK-NEXT:    [[Y3:%.*]] = add <4 x i32> [[X3]], [[C3]]
-; CHECK-NEXT:    [[Y4:%.*]] = add <4 x i32> [[X4]], [[C4]]
 ; CHECK-NEXT:    [[CC1:%.*]] = shufflevector <4 x i32> [[Y1]], <4 x i32> [[Y2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 7>
-; CHECK-NEXT:    [[CC2:%.*]] = shufflevector <4 x i32> [[Y3]], <4 x i32> [[Y4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A3A1:%.*]], <4 x i32> [[A4A1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[C]], <16 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[CC2:%.*]] = add <8 x i32> [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i32> [[CC1]], <8 x i32> [[CC2]], <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
 ;
@@ -275,15 +269,17 @@ define <16 x i32> @two_concats(<4 x i32> %a1a, <4 x i32> %a2a, <4 x i32> %a3a, <
 
 define <16 x double> @konkat(<16 x double> %wide.vec, <16 x double> %wide.vec115, <2 x double> %l27, <2 x double> %l28, <2 x double> %l29, <2 x double> %l30) {
 ; CHECK-LABEL: @konkat(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[L27:%.*]], <2 x double> [[L28:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[L29:%.*]], <2 x double> [[L30:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[L27]], <2 x double> [[L28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[L29]], <2 x double> [[L30]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul reassoc nsz contract <16 x double> [[WIDE_VEC:%.*]], [[TMP7]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd reassoc nsz contract <16 x double> [[WIDE_VEC115:%.*]], [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x double> [[WIDE_VEC115:%.*]], <16 x double> poison, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x double> [[WIDE_VEC:%.*]], <16 x double> poison, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[L27:%.*]], <2 x double> [[L28:%.*]], <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul reassoc nsz contract <8 x double> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd reassoc nsz contract <8 x double> [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x double> [[WIDE_VEC115]], <16 x double> poison, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[L29:%.*]], <2 x double> [[L30:%.*]], <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul reassoc nsz contract <8 x double> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd reassoc nsz contract <8 x double> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> [[TMP10]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; CHECK-NEXT:    ret <16 x double> [[INTERLEAVED_VEC]]
 ;
   %broadcast.splat = shufflevector <2 x double> %l27, <2 x double> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index 1c128c8f56a03..c4ec25ec11fb0 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -64,13 +64,9 @@ define <8 x i8> @wrong_addsub(<8 x i8> %a, <8 x i8> %b) {
 ; Different lanes that do not make an identity
 define <8 x i8> @wrong_lanes(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: @wrong_lanes(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i8> [[AT]], [[BT]]
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i8> [[AB]], [[BB]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    ret <8 x i8> [[R]]
 ;
   %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -305,11 +301,7 @@ define <8 x i8> @constantsplat(<8 x i8> %a) {
 
 define <8 x i8> @constantdiff(<8 x i8> %a) {
 ; CHECK-LABEL: @constantdiff(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i8> [[AT]], <i8 1, i8 2, i8 3, i8 4>
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i8> [[AB]], <i8 5, i8 6, i8 7, i8 8>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i8> [[A:%.*]], <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
 ; CHECK-NEXT:    ret <8 x i8> [[R]]
 ;
   %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -322,11 +314,7 @@ define <8 x i8> @constantdiff(<8 x i8> %a) {
 
 define <8 x i8> @constantdiff2(<8 x i8> %a) {
 ; CHECK-LABEL: @constantdiff2(
-; CHECK-NEXT:    [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT:    [[ABT:%.*]] = add <4 x i8> [[AT]], <i8 1, i8 2, i8 3, i8 4>
-; CHECK-NEXT:    [[ABB:%.*]] = add <4 x i8> [[AB]], <i8 1, i8 2, i8 3, i8 4>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[R:%.*]] = add <8 x i8> [[A:%.*]], <i8 4, i8 3, i8 2, i8 1, i8 4, i8 3, i8 2, i8 1>
 ; CHECK-NEXT:    ret <8 x i8> [[R]]
 ;
   %ab = shufflevector <8 x i8> %a, <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -839,15 +827,24 @@ define void @v8f64interleave(i64 %0, ptr %1, ptr %x, double %z) {
 ; CHECK-LABEL: @v8f64interleave(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[Z:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x double>, ptr [[TMP1:%.*]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[X:%.*]], i64 [[TMP0:%.*]]
 ; CHECK-NEXT:    [[WIDE_VEC34:%.*]] = load <16 x double>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 7
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -56
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <16 x double> [[WIDE_VEC]], [[TMP6]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x double> [[WIDE_VEC34]], [[TMP7]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast <8 x double> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <8 x double> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x double> [[WIDE_VEC34]], <16 x double> poison, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x double> [[WIDE_VEC]], <16 x double> poison, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul fast <8 x double> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd fast <8 x double> [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP10]], <8 x double> [[TMP15]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
 ; CHECK-NEXT:    store <16 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll
index f3e5d273e88cc..3a3f6759d56ac 100644
--- a/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/RISCV/shuffle-of-intrinsics.ll
@@ -4,9 +4,10 @@
 define <8 x i32> @test1(<4 x i32> %0, <4 x i32> %1) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false)
-; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
 ;
 entry:
   %2 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %0, i1 false)
@@ -48,8 +49,9 @@ entry:
 define <8 x i1> @test4(<4 x float> %0, <4 x float> %1) {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> [[TMP2]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP0:%.*]], i32 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> [[TMP1:%.*]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i1> [[TMP3]]
 ;
 entry:
diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index 862c07c3d2270..a65aae7db4f0c 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -7,12 +7,20 @@
 declare void @use_v4f64(<4 x double>)
 
 define <4 x double> @fadd_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: define <4 x double> @fadd_v4f64(
-; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x double> [[POST]]
+; SSE-LABEL: define <4 x double> @fadd_v4f64(
+; SSE-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; SSE-NEXT:    [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <4 x double> [[POST]]
+;
+; AVX-LABEL: define <4 x double> @fadd_v4f64(
+; AVX-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; AVX-NEXT:    [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
+; AVX-NEXT:    [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; AVX-NEXT:    ret <4 x double> [[POST]]
 ;
   %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -22,12 +30,20 @@ define <4 x double> @fadd_v4f64(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @fadd_v4f64_poison_idx(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: define <4 x double> @fadd_v4f64_poison_idx(
-; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 poison>
-; CHECK-NEXT:    [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x double> [[POST]]
+; SSE-LABEL: define <4 x double> @fadd_v4f64_poison_idx(
+; SSE-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 poison>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 poison>
+; SSE-NEXT:    [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <4 x double> [[POST]]
+;
+; AVX-LABEL: define <4 x double> @fadd_v4f64_poison_idx(
+; AVX-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; AVX-NEXT:    [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
+; AVX-NEXT:    [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 4>
+; AVX-NEXT:    ret <4 x double> [[POST]]
 ;
   %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -37,12 +53,20 @@ define <4 x double> @fadd_v4f64_poison_idx(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @fadd_v4f64_mixed_types(<4 x double> %a, <2 x double> %b) {
-; CHECK-LABEL: define <4 x double> @fadd_v4f64_mixed_types(
-; CHECK-SAME: <4 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x double> [[POST]]
+; SSE-LABEL: define <4 x double> @fadd_v4f64_mixed_types(
+; SSE-SAME: <4 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; SSE-NEXT:    [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT:    ret <4 x double> [[POST]]
+;
+; AVX-LABEL: define <4 x double> @fadd_v4f64_mixed_types(
+; AVX-SAME: <4 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[B1:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; AVX-NEXT:    [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
+; AVX-NEXT:    [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; AVX-NEXT:    ret <4 x double> [[POST]]
 ;
   %a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %b1 = shufflevector <2 x double> %b, <2 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -53,7 +77,7 @@ define <4 x double> @fadd_v4f64_mixed_types(<4 x double> %a, <2 x double> %b) {
 
 define <4 x float> @fadd_v4f32_mixed_types(<4 x float> %a0) {
 ; CHECK-LABEL: define <4 x float> @fadd_v4f32_mixed_types(
-; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> zeroinitializer, <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[POST:%.*]] = fmul <4 x float> [[TMP1]], <float 0.000000e+00, float 0.000000e+00, float undef, float undef>
 ; CHECK-NEXT:    ret <4 x float> [[POST]]
@@ -114,10 +138,11 @@ define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %
 define <4 x i32> @sdiv_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: define <4 x i32> @sdiv_v4i32(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
 ; CHECK-NEXT:    [[POST:%.*]] = sdiv <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret <4 x i32> [[POST]]
+; CHECK-NEXT:    [[POST1:%.*]] = shufflevector <4 x i32> [[POST]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 0>
+; CHECK-NEXT:    ret <4 x i32> [[POST1]]
 ;
   %a1 = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %b1 = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll
index 77b44d0e40e14..0ac14a7196b2b 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll
@@ -133,12 +133,19 @@ define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z
 ; type change via shuffle
 
 define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
-; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type(
-; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]]
-; CHECK-NEXT:    [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]]
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 0, i32 1, i32 1, i32 6>
-; CHECK-NEXT:    ret <8 x float> [[R]]
+; SSE-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]]
+; SSE-NEXT:    [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]]
+; SSE-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 0, i32 1, i32 1, i32 6>
+; SSE-NEXT:    ret <8 x float> [[R]]
+;
+; AVX-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type(
+; AVX-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 0, i32 1, i32 1, i32 6>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <8 x i32> <i32 0, i32 3, i32 0, i32 3, i32 0, i32 1, i32 1, i32 2>
+; AVX-NEXT:    [[R:%.*]] = fmul <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    ret <8 x float> [[R]]
 ;
   %b0 = fmul <4 x float> %x, %y
   %b1 = fmul <4 x float> %z, %x
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
index fba4b60ef417b..3f4f51b671fb5 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
@@ -173,8 +173,9 @@ define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
 ;
 ; AVX-LABEL: define <16 x i32> @rconcat_sext_v8i16_v16i32(
 ; AVX-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] {
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX-NEXT:    [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
+; AVX-NEXT:    [[X0:%.*]] = sext <8 x i16> [[A0]] to <8 x i32>
+; AVX-NEXT:    [[X1:%.*]] = sext <8 x i16> [[A1]] to <8 x i32>
+; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:    ret <16 x i32> [[R]]
 ;
   %x0 = sext <8 x i16> %a0 to <8 x i32>
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
index 2588f9116f322..085158b58eef9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
@@ -32,28 +32,12 @@ define <8 x i16> @src_v4tov8_i16(<4 x i1> %a, <4 x i1> %b, <4 x i16> %x, <4 x i1
 }
 
 define <8 x i16> @src_v8tov8_i16(<8 x i1> %a, <8 x i1> %b, <8 x i16> %x, <8 x i16> %y, <8 x i16> %z) {
-; SSE-LABEL: define <8 x i16> @src_v8tov8_i16(
-; SSE-SAME: <8 x i1> [[A:%.*]], <8 x i1> [[B:%.*]], <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]], <8 x i16> [[Z:%.*]]) #[[ATTR0]] {
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[A]], <8 x i1> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[X]], <8 x i16> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[Z]], <8 x i16> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[RES:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]]
-; SSE-NEXT:    ret <8 x i16> [[RES]]
-;
-; AVX2-LABEL: define <8 x i16> @src_v8tov8_i16(
-; AVX2-SAME: <8 x i1> [[A:%.*]], <8 x i1> [[B:%.*]], <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]], <8 x i16> [[Z:%.*]]) #[[ATTR0]] {
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[A]], <8 x i1> [[B]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[X]], <8 x i16> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[Z]], <8 x i16> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX2-NEXT:    [[RES:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]]
-; AVX2-NEXT:    ret <8 x i16> [[RES]]
-;
-; AVX512-LABEL: define <8 x i16> @src_v8tov8_i16(
-; AVX512-SAME: <8 x i1> [[A:%.*]], <8 x i1> [[B:%.*]], <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]], <8 x i16> [[Z:%.*]]) #[[ATTR0]] {
-; AVX512-NEXT:    [[SELECT_XZ:%.*]] = select <8 x i1> [[A]], <8 x i16> [[X]], <8 x i16> [[Z]]
-; AVX512-NEXT:    [[SELECT_YX:%.*]] = select <8 x i1> [[B]], <8 x i16> [[Y]], <8 x i16> [[X]]
-; AVX512-NEXT:    [[RES:%.*]] = shufflevector <8 x i16> [[SELECT_XZ]], <8 x i16> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT:    ret <8 x i16> [[RES]]
+; CHECK-LABEL: define <8 x i16> @src_v8tov8_i16(
+; CHECK-SAME: <8 x i1> [[A:%.*]], <8 x i1> [[B:%.*]], <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]], <8 x i16> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[SELECT_XZ:%.*]] = select <8 x i1> [[A]], <8 x i16> [[X]], <8 x i16> [[Z]]
+; CHECK-NEXT:    [[SELECT_YX:%.*]] = select <8 x i1> [[B]], <8 x i16> [[Y]], <8 x i16> [[X]]
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <8 x i16> [[SELECT_XZ]], <8 x i16> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i16> [[RES]]
 ;
   %select.xz = select <8 x i1> %a, <8 x i16> %x, <8 x i16> %z
   %select.yx = select <8 x i1> %b, <8 x i16> %y, <8 x i16> %x



More information about the llvm-commits mailing list