[llvm] [SLP]Add cost estimation for gather node reshuffling (PR #115201)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 20 04:35:02 PST 2024


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/115201

>From 9e240ac5954cfb7700cf0aafd4b0600dcac9f590 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 6 Nov 2024 19:52:20 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
 =?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 173 ++++++-
 .../AArch64/reused-scalar-repeated-in-node.ll |   8 +-
 .../SLPVectorizer/RISCV/complex-loads.ll      | 462 +++++++++---------
 .../X86/scatter-vectorize-reorder.ll          |   2 +-
 .../alternate-cmp-swapped-pred-parent.ll      |   6 +-
 .../extract-many-users-buildvector.ll         |  75 ++-
 ...hered-consecutive-loads-different-types.ll |  10 +-
 .../SLPVectorizer/reorder-clustered-node.ll   |  72 ++-
 .../resized-alt-shuffle-after-minbw.ll        |   6 +-
 9 files changed, 509 insertions(+), 305 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53e514766fee81..48419699f9cd53 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4846,8 +4846,21 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
                int Index = 0, VectorType *SubTp = nullptr,
                ArrayRef<const Value *> Args = {}) {
-  if (Kind != TTI::SK_PermuteTwoSrc)
+  if (Kind != TTI::SK_PermuteTwoSrc) {
+    int SplatIdx = PoisonMaskElem;
+    if (!Mask.empty() && all_of(Mask, [&](int Idx) {
+          if (Idx == PoisonMaskElem)
+            return true;
+          if (SplatIdx == PoisonMaskElem) {
+            SplatIdx = Idx;
+            return true;
+          }
+          return SplatIdx == Idx;
+        }))
+      return TTI.getShuffleCost(TTI::SK_Broadcast, Tp, Mask, CostKind, Index,
+                                SubTp, Args);
     return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
+  }
   int NumSrcElts = Tp->getElementCount().getKnownMinValue();
   int NumSubElts;
   if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
@@ -10257,10 +10270,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
             Idx = EMask[Idx];
         }
         CommonVF = E->Scalars.size();
-      } else if (std::optional<unsigned> Factor = E->getInterleaveFactor();
-                 Factor && E->Scalars.size() != Mask.size() &&
+      } else if (unsigned Factor = E->getInterleaveFactor();
+                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                  ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
-                                                               *Factor)) {
+                                                               Factor)) {
         // Deinterleaved nodes are free.
         std::iota(CommonMask.begin(), CommonMask.end(), 0);
       }
@@ -12935,6 +12948,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
     // No perfect match, just shuffle, so choose the first tree node from the
     // tree.
     Entries.push_back(FirstEntries.front());
+    VF = FirstEntries.front()->getVectorFactor();
   } else {
     // Try to find nodes with the same vector factor.
     assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
@@ -12975,6 +12989,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       Entries.push_back(SecondEntries.front());
       VF = std::max(Entries.front()->getVectorFactor(),
                     Entries.back()->getVectorFactor());
+    } else {
+      VF = Entries.front()->getVectorFactor();
     }
   }
 
@@ -13077,26 +13093,149 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
   // Pair.first is the offset to the vector, while Pair.second is the index of
   // scalar in the list.
   for (const std::pair<unsigned, int> &Pair : EntryLanes) {
-    unsigned Idx = Part * VL.size() + Pair.second;
+    int Idx = Part * VL.size() + Pair.second;
     Mask[Idx] =
         Pair.first * VF +
         (ForOrder ? std::distance(
                         Entries[Pair.first]->Scalars.begin(),
                         find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                   : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
-    IsIdentity &= Mask[Idx] == Pair.second;
+    IsIdentity &= Mask[Idx] % VL.size() == Idx % VL.size();
   }
-  switch (Entries.size()) {
-  case 1:
-    if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
-      return TargetTransformInfo::SK_PermuteSingleSrc;
-    break;
-  case 2:
-    if (EntryLanes.size() > 2 || VL.size() <= 2)
-      return TargetTransformInfo::SK_PermuteTwoSrc;
-    break;
-  default:
-    break;
+  if (ForOrder || IsIdentity || Entries.empty()) {
+    switch (Entries.size()) {
+    case 1:
+      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
+        return TargetTransformInfo::SK_PermuteSingleSrc;
+      break;
+    case 2:
+      if (EntryLanes.size() > 2 || VL.size() <= 2)
+        return TargetTransformInfo::SK_PermuteTwoSrc;
+      break;
+    default:
+      break;
+    }
+  } else if (!isa<VectorType>(VL.front()->getType()) &&
+             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
+    // Do the cost estimation if shuffle beneficial than buildvector.
+    SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
+                             std::next(Mask.begin(), (Part + 1) * VL.size()));
+    int MinElement = SubMask.front(), MaxElement = SubMask.front();
+    for (int Idx : SubMask) {
+      if (Idx == PoisonMaskElem)
+        continue;
+      if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
+        MinElement = Idx;
+      if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
+        MaxElement = Idx;
+    }
+    assert(MaxElement >= 0 && MinElement >= 0 &&
+           "Expected at least single element.");
+    unsigned NewVF = std::max<unsigned>(
+        VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
+                                                 (MaxElement % VF) -
+                                                     (MinElement % VF) + 1));
+    if (NewVF < VF) {
+      for_each(SubMask, [&](int &Idx) {
+        if (Idx == PoisonMaskElem)
+          return;
+        Idx = (Idx % VF) - (MinElement % VF) +
+              (Idx >= static_cast<int>(VF) ? NewVF : 0);
+      });
+      VF = NewVF;
+    }
+
+    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    auto *VecTy = getWidenedType(VL.front()->getType(), VF);
+    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
+    auto GetShuffleCost = [&,
+                           &TTI = *TTI](ArrayRef<int> Mask,
+                                        ArrayRef<const TreeEntry *> Entries,
+                                        VectorType *VecTy) -> InstructionCost {
+      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
+          ShuffleVectorInst::isDeInterleaveMaskOfFactor(
+              Mask, Entries.front()->getInterleaveFactor()))
+        return TTI::TCC_Free;
+      return ::getShuffleCost(TTI,
+                              Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
+                                                 : TTI::SK_PermuteSingleSrc,
+                              VecTy, Mask, CostKind);
+    };
+    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
+    InstructionCost FirstShuffleCost = 0;
+    SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
+    if (Entries.size() == 1 || !Entries[0]->isGather()) {
+      FirstShuffleCost = ShuffleCost;
+    } else {
+      // Transform mask to include only first entry.
+      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+      bool IsIdentity = true;
+      for (auto [I, Idx] : enumerate(FirstMask)) {
+        if (Idx >= static_cast<int>(VF)) {
+          Idx = PoisonMaskElem;
+        } else {
+          DemandedElts.clearBit(I);
+          if (Idx != PoisonMaskElem)
+            IsIdentity &= static_cast<int>(I) == Idx;
+        }
+      }
+      if (!IsIdentity)
+        FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
+      FirstShuffleCost += TTI->getScalarizationOverhead(
+          MaskVecTy, DemandedElts, /*Insert=*/true,
+          /*Extract=*/false, CostKind);
+    }
+    InstructionCost SecondShuffleCost = 0;
+    SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
+    if (Entries.size() == 1 || !Entries[1]->isGather()) {
+      SecondShuffleCost = ShuffleCost;
+    } else {
+      // Transform mask to include only first entry.
+      APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+      bool IsIdentity = true;
+      for (auto [I, Idx] : enumerate(SecondMask)) {
+        if (Idx < static_cast<int>(VF) && Idx >= 0) {
+          Idx = PoisonMaskElem;
+        } else {
+          DemandedElts.clearBit(I);
+          if (Idx != PoisonMaskElem) {
+            Idx -= VF;
+            IsIdentity &= static_cast<int>(I) == Idx;
+          }
+        }
+      }
+      if (!IsIdentity)
+        SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
+      SecondShuffleCost += TTI->getScalarizationOverhead(
+          MaskVecTy, DemandedElts, /*Insert=*/true,
+          /*Extract=*/false, CostKind);
+    }
+    APInt DemandedElts = APInt::getAllOnes(SubMask.size());
+    for (auto [I, Idx] : enumerate(SubMask))
+      if (Idx == PoisonMaskElem)
+        DemandedElts.clearBit(I);
+    InstructionCost BuildVectorCost =
+        TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
+                                      /*Extract=*/false, CostKind);
+    const TreeEntry *BestEntry = nullptr;
+    if (FirstShuffleCost < ShuffleCost) {
+      copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
+      BestEntry = Entries.front();
+      ShuffleCost = FirstShuffleCost;
+    }
+    if (SecondShuffleCost < ShuffleCost) {
+      copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
+      BestEntry = Entries[1];
+      ShuffleCost = SecondShuffleCost;
+    }
+    if (BuildVectorCost >= ShuffleCost) {
+      if (BestEntry) {
+        Entries.clear();
+        Entries.push_back(BestEntry);
+      }
+      return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
+                                : TargetTransformInfo::SK_PermuteSingleSrc;
+    }
   }
   Entries.clear();
   // Clear the corresponding mask elements.
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index d6073ea4bbbae6..96ff73f117a739 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -46,12 +46,12 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x float> [ poison, %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
 ; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <16 x float> [[TMP17]], [[TMP13]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 1, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 6, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 1, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP20]], <16 x i32> <i32 0, i32 17, i32 2, i32 16, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 17, i32 6, i32 7, i32 8, i32 23, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 3, i32 1, i32 3, i32 9, i32 3, i32 1, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP22]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 17, i32 6, i32 7, i32 8, i32 23, i32 10, i32 11, i32 12, i32 22, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 3, i32 1, i32 3, i32 9, i32 3, i32 1, i32 5, i32 13, i32 9, i32 9>
 ; CHECK-NEXT:    [[TMP25:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP14]], <2 x float> [[TMP0]], i64 2)
 ; CHECK-NEXT:    [[TMP26:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 912d60d0cc3867..f57e5de07807e6 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -28,13 +28,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
-; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
-; CHECK-NEXT:    [[TMP84:%.*]] = zext i8 [[TMP29]] to i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
@@ -50,7 +46,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; CHECK-NEXT:    [[TMP83:%.*]] = zext i8 [[TMP33]] to i32
 ; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]]
@@ -61,246 +56,203 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], splat (i32 16)
 ; CHECK-NEXT:    [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]]
-; CHECK-NEXT:    [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
 ; CHECK-NEXT:    [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1
-; CHECK-NEXT:    [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
-; CHECK-NEXT:    [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]]
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
+; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
+; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1
+; CHECK-NEXT:    [[ARRAYIDX34_3:%.*]] = getelementptr i8, ptr null, i64 3
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1
+; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr null, align 1
+; CHECK-NEXT:    [[ARRAYIDX39_3:%.*]] = getelementptr i8, ptr null, i64 7
+; CHECK-NEXT:    [[TMP47:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1
+; CHECK-NEXT:    [[TMP48:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[TMP60:%.*]] = load <4 x i8>, ptr null, align 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = load i8, ptr null, align 1
-; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32>
+; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <4 x i8> [[TMP60]], <4 x i8> poison, <2 x i32> <i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <2 x i8> [[TMP72]], i8 [[TMP52]], i32 1
+; CHECK-NEXT:    [[TMP62:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP77:%.*]] = zext i8 [[TMP52]] to i32
-; CHECK-NEXT:    [[TMP54:%.*]] = load <2 x i8>, ptr null, align 1
+; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <2 x ptr> <ptr poison, ptr null>, ptr [[ARRAYIDX22_3]], i32 0
+; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP80]], i32 1, <2 x i1> splat (i1 true), <2 x i8> poison)
 ; CHECK-NEXT:    [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32>
-; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP63:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[TMP83:%.*]] = shufflevector <2 x i8> [[TMP48]], <2 x i8> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <2 x i8> [[TMP83]], i8 [[TMP33]], i32 0
 ; CHECK-NEXT:    [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32>
-; CHECK-NEXT:    [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP76]]
-; CHECK-NEXT:    [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], splat (i32 16)
-; CHECK-NEXT:    [[TMP90:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]]
-; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2
-; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2
-; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6
-; CHECK-NEXT:    [[TMP64:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1
+; CHECK-NEXT:    [[TMP81:%.*]] = sub <2 x i32> [[TMP58]], [[TMP76]]
+; CHECK-NEXT:    [[TMP167:%.*]] = shl <2 x i32> [[TMP81]], splat (i32 16)
+; CHECK-NEXT:    [[TMP168:%.*]] = add <2 x i32> [[TMP167]], [[TMP59]]
+; CHECK-NEXT:    [[TMP64:%.*]] = shufflevector <4 x i8> [[TMP60]], <4 x i8> poison, <2 x i32> <i32 3, i32 1>
 ; CHECK-NEXT:    [[TMP79:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32>
-; CHECK-NEXT:    [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1
+; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i8> [[TMP53]], i8 [[TMP34]], i32 0
 ; CHECK-NEXT:    [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]]
-; CHECK-NEXT:    [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison)
-; CHECK-NEXT:    [[TMP98:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32>
-; CHECK-NEXT:    [[TMP100:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1
-; CHECK-NEXT:    [[TMP103:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32>
-; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP98]], [[TMP103]]
-; CHECK-NEXT:    [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], splat (i32 16)
-; CHECK-NEXT:    [[TMP74:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <2 x i32> [[TMP90]], i32 1
-; CHECK-NEXT:    [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]]
-; CHECK-NEXT:    [[SUB51_3:%.*]] = sub i32 [[TMP78]], [[TMP71]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <2 x i32> [[TMP74]], i32 0
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1
-; CHECK-NEXT:    [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]]
-; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]]
-; CHECK-NEXT:    [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]]
-; CHECK-NEXT:    [[TMP113:%.*]] = shufflevector <2 x i32> [[TMP43]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP122:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[ADD48_3]], i32 0
-; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <2 x i32> [[TMP43]], i32 [[ADD55_3]], i32 0
-; CHECK-NEXT:    [[TMP123:%.*]] = sub <2 x i32> [[TMP122]], [[TMP72]]
-; CHECK-NEXT:    [[ADD55_4:%.*]] = add i32 [[TMP107]], [[SUB51_3]]
-; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <2 x i32> [[TMP126]], i32 [[SUB51_3]], i32 0
-; CHECK-NEXT:    [[TMP130:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[TMP107]], i32 0
-; CHECK-NEXT:    [[TMP143:%.*]] = sub <2 x i32> [[TMP129]], [[TMP130]]
-; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD48_2]]
-; CHECK-NEXT:    [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_4]]
-; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15
+; CHECK-NEXT:    [[TMP169:%.*]] = shufflevector <2 x i8> [[TMP53]], <2 x i8> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP170:%.*]] = insertelement <2 x i8> [[TMP169]], i8 [[TMP43]], i32 0
+; CHECK-NEXT:    [[TMP171:%.*]] = zext <2 x i8> [[TMP170]] to <2 x i32>
+; CHECK-NEXT:    [[TMP172:%.*]] = insertelement <2 x i8> [[TMP48]], i8 [[TMP47]], i32 0
+; CHECK-NEXT:    [[TMP173:%.*]] = zext <2 x i8> [[TMP172]] to <2 x i32>
+; CHECK-NEXT:    [[TMP66:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]]
+; CHECK-NEXT:    [[TMP67:%.*]] = shl <2 x i32> [[TMP66]], splat (i32 16)
+; CHECK-NEXT:    [[TMP68:%.*]] = add <2 x i32> [[TMP67]], [[TMP65]]
+; CHECK-NEXT:    [[TMP69:%.*]] = sub <2 x i32> [[TMP168]], [[TMP68]]
+; CHECK-NEXT:    [[TMP70:%.*]] = shufflevector <2 x i32> [[TMP68]], <2 x i32> [[TMP42]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP30]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP152:%.*]] = add <2 x i32> [[TMP70]], [[TMP71]]
+; CHECK-NEXT:    [[TMP73:%.*]] = shufflevector <2 x i32> [[TMP68]], <2 x i32> [[TMP42]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP74:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP30]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP75:%.*]] = add <2 x i32> [[TMP73]], [[TMP74]]
+; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1
+; CHECK-NEXT:    [[TMP197:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1
+; CHECK-NEXT:    [[SUB59:%.*]] = add i32 [[TMP197]], [[TMP176]]
+; CHECK-NEXT:    [[TMP220:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0
+; CHECK-NEXT:    [[TMP221:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0
+; CHECK-NEXT:    [[SUB59_1:%.*]] = add i32 [[TMP221]], [[TMP220]]
+; CHECK-NEXT:    [[TMP222:%.*]] = sub <2 x i32> [[TMP152]], [[TMP75]]
+; CHECK-NEXT:    [[ADD112_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 0
+; CHECK-NEXT:    [[XOR_I63_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 1
+; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
+; CHECK-NEXT:    [[TMP223:%.*]] = shufflevector <2 x i32> [[TMP69]], <2 x i32> [[TMP44]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP69]], <2 x i32> [[TMP44]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP85:%.*]] = sub <2 x i32> [[TMP223]], [[TMP84]]
+; CHECK-NEXT:    [[ADD94:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
+; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
+; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[TMP77]], 15
+; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
+; CHECK-NEXT:    [[SHR_I49:%.*]] = lshr i32 [[TMP197]], 15
+; CHECK-NEXT:    [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
+; CHECK-NEXT:    [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
+; CHECK-NEXT:    [[TMP86:%.*]] = extractelement <2 x i32> [[TMP222]], i32 0
+; CHECK-NEXT:    [[TMP87:%.*]] = extractelement <2 x i32> [[TMP222]], i32 1
+; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP86]], [[TMP87]]
+; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0
+; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1
+; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP88]], [[TMP89]]
+; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP89]], [[TMP88]]
+; CHECK-NEXT:    [[SHR_I49_3:%.*]] = lshr i32 [[CONV1]], 15
 ; CHECK-NEXT:    [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537
 ; CHECK-NEXT:    [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535
-; CHECK-NEXT:    [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15
-; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
-; CHECK-NEXT:    [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
-; CHECK-NEXT:    [[ADD94_5:%.*]] = add i32 [[ADD55_4]], [[ADD55_2]]
-; CHECK-NEXT:    [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_4]]
-; CHECK-NEXT:    [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15
-; CHECK-NEXT:    [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537
-; CHECK-NEXT:    [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535
-; CHECK-NEXT:    [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15
-; CHECK-NEXT:    [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537
-; CHECK-NEXT:    [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535
-; CHECK-NEXT:    [[TMP144:%.*]] = extractelement <2 x i32> [[TMP123]], i32 0
-; CHECK-NEXT:    [[TMP145:%.*]] = extractelement <2 x i32> [[TMP123]], i32 1
-; CHECK-NEXT:    [[ADD94_4:%.*]] = add i32 [[TMP144]], [[TMP145]]
-; CHECK-NEXT:    [[TMP169:%.*]] = sub i32 [[TMP145]], [[TMP144]]
-; CHECK-NEXT:    [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15
-; CHECK-NEXT:    [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537
-; CHECK-NEXT:    [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535
-; CHECK-NEXT:    [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0
-; CHECK-NEXT:    [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1
-; CHECK-NEXT:    [[ADD94_3:%.*]] = add i32 [[TMP146]], [[TMP147]]
-; CHECK-NEXT:    [[SUB102_3:%.*]] = sub i32 [[TMP147]], [[TMP146]]
-; CHECK-NEXT:    [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15
-; CHECK-NEXT:    [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537
-; CHECK-NEXT:    [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535
-; CHECK-NEXT:    [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
-; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32>
-; CHECK-NEXT:    [[TMP148:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
-; CHECK-NEXT:    [[TMP67:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32>
-; CHECK-NEXT:    [[TMP149:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32>
-; CHECK-NEXT:    [[TMP150:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
-; CHECK-NEXT:    [[TMP109:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP89:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32>
-; CHECK-NEXT:    [[TMP87:%.*]] = sub <2 x i32> [[TMP108]], [[TMP89]]
-; CHECK-NEXT:    [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], splat (i32 16)
-; CHECK-NEXT:    [[TMP112:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32>
-; CHECK-NEXT:    [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP128:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32>
-; CHECK-NEXT:    [[TMP131:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32>
-; CHECK-NEXT:    [[TMP95:%.*]] = sub <2 x i32> [[TMP128]], [[TMP132]]
-; CHECK-NEXT:    [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], splat (i32 16)
-; CHECK-NEXT:    [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
-; CHECK-NEXT:    [[TMP117:%.*]] = sub <2 x i32> [[TMP97]], [[TMP120]]
-; CHECK-NEXT:    [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP117]]
-; CHECK-NEXT:    [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
-; CHECK-NEXT:    [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]]
-; CHECK-NEXT:    [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]]
-; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP101:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]]
-; CHECK-NEXT:    [[TMP151:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]]
-; CHECK-NEXT:    [[TMP111:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0
-; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1
-; CHECK-NEXT:    [[ADD55:%.*]] = add i32 [[TMP99]], [[TMP111]]
-; CHECK-NEXT:    [[SUB51:%.*]] = sub i32 [[TMP111]], [[TMP99]]
-; CHECK-NEXT:    [[TMP153:%.*]] = extractelement <2 x i32> [[TMP151]], i32 0
-; CHECK-NEXT:    [[TMP157:%.*]] = extractelement <2 x i32> [[TMP151]], i32 1
-; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[TMP157]], [[TMP153]]
-; CHECK-NEXT:    [[SUB59:%.*]] = sub i32 [[TMP153]], [[TMP157]]
-; CHECK-NEXT:    [[SHR_I59_1:%.*]] = lshr i32 [[TMP99]], 15
-; CHECK-NEXT:    [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537
-; CHECK-NEXT:    [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535
-; CHECK-NEXT:    [[SHR_I59_4:%.*]] = lshr i32 [[TMP157]], 15
-; CHECK-NEXT:    [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537
-; CHECK-NEXT:    [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535
-; CHECK-NEXT:    [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
-; CHECK-NEXT:    [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32>
-; CHECK-NEXT:    [[TMP158:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
-; CHECK-NEXT:    [[TMP114:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP133:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32>
-; CHECK-NEXT:    [[TMP121:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
-; CHECK-NEXT:    [[TMP116:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP115:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32>
-; CHECK-NEXT:    [[TMP159:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP118:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32>
-; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP115]], [[TMP134]]
-; CHECK-NEXT:    [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], splat (i32 16)
-; CHECK-NEXT:    [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP191:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32>
-; CHECK-NEXT:    [[TMP160:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32>
-; CHECK-NEXT:    [[TMP171:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT:    [[TMP172:%.*]] = zext <2 x i8> [[TMP171]] to <2 x i32>
-; CHECK-NEXT:    [[TMP135:%.*]] = sub <2 x i32> [[TMP161]], [[TMP172]]
-; CHECK-NEXT:    [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], splat (i32 16)
-; CHECK-NEXT:    [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1
-; CHECK-NEXT:    [[TMP173:%.*]] = sub <2 x i32> [[TMP137]], [[TMP191]]
-; CHECK-NEXT:    [[TMP174:%.*]] = add <2 x i32> [[TMP136]], [[TMP173]]
-; CHECK-NEXT:    [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0
-; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP133]]
-; CHECK-NEXT:    [[TMP192:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]]
-; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP174]], [[TMP192]]
-; CHECK-NEXT:    [[TMP155:%.*]] = sub <2 x i32> [[TMP192]], [[TMP174]]
-; CHECK-NEXT:    [[TMP139:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0
-; CHECK-NEXT:    [[TMP142:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1
-; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP142]], [[TMP139]]
-; CHECK-NEXT:    [[SUB45_1:%.*]] = sub i32 [[TMP139]], [[TMP142]]
-; CHECK-NEXT:    [[TMP138:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0
-; CHECK-NEXT:    [[SUB47_1:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1
-; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[SUB47_1]], [[TMP138]]
-; CHECK-NEXT:    [[SUB59_1:%.*]] = sub i32 [[TMP138]], [[SUB47_1]]
-; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP142]], 15
+; CHECK-NEXT:    [[TMP90:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1
+; CHECK-NEXT:    [[TMP102:%.*]] = zext <2 x i8> [[TMP90]] to <2 x i32>
+; CHECK-NEXT:    [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
+; CHECK-NEXT:    [[TMP93:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32>
+; CHECK-NEXT:    [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32>
+; CHECK-NEXT:    [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
+; CHECK-NEXT:    [[TMP99:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP100:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32>
+; CHECK-NEXT:    [[TMP101:%.*]] = sub <2 x i32> [[TMP97]], [[TMP100]]
+; CHECK-NEXT:    [[TMP224:%.*]] = shl <2 x i32> [[TMP101]], splat (i32 16)
+; CHECK-NEXT:    [[TMP103:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP104:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32>
+; CHECK-NEXT:    [[TMP105:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32>
+; CHECK-NEXT:    [[TMP107:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32>
+; CHECK-NEXT:    [[TMP109:%.*]] = sub <2 x i32> [[TMP106]], [[TMP108]]
+; CHECK-NEXT:    [[TMP110:%.*]] = shl <2 x i32> [[TMP109]], splat (i32 16)
+; CHECK-NEXT:    [[TMP111:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1
+; CHECK-NEXT:    [[TMP112:%.*]] = sub <2 x i32> [[TMP111]], [[TMP104]]
+; CHECK-NEXT:    [[TMP113:%.*]] = add <2 x i32> [[TMP110]], [[TMP112]]
+; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0
+; CHECK-NEXT:    [[TMP115:%.*]] = sub <2 x i32> [[TMP114]], [[TMP94]]
+; CHECK-NEXT:    [[TMP116:%.*]] = add <2 x i32> [[TMP224]], [[TMP115]]
+; CHECK-NEXT:    [[TMP117:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP116]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP118:%.*]] = add <2 x i32> [[TMP113]], [[TMP116]]
+; CHECK-NEXT:    [[TMP119:%.*]] = sub <2 x i32> [[TMP116]], [[TMP113]]
+; CHECK-NEXT:    [[TMP120:%.*]] = extractelement <2 x i32> [[TMP118]], i32 0
+; CHECK-NEXT:    [[TMP121:%.*]] = extractelement <2 x i32> [[TMP118]], i32 1
+; CHECK-NEXT:    [[ADD48:%.*]] = add i32 [[TMP121]], [[TMP120]]
+; CHECK-NEXT:    [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP118]], <2 x i32> [[TMP44]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP123:%.*]] = shufflevector <2 x i32> [[TMP118]], <2 x i32> [[TMP44]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP124:%.*]] = sub <2 x i32> [[TMP122]], [[TMP123]]
+; CHECK-NEXT:    [[TMP125:%.*]] = add <2 x i32> [[TMP122]], [[TMP123]]
+; CHECK-NEXT:    [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP124]], <2 x i32> [[TMP125]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP127:%.*]] = extractelement <2 x i32> [[TMP126]], i32 1
+; CHECK-NEXT:    [[ADD94_1:%.*]] = add i32 [[ADD113_2]], [[TMP127]]
+; CHECK-NEXT:    [[TMP128:%.*]] = extractelement <2 x i32> [[TMP119]], i32 0
+; CHECK-NEXT:    [[TMP129:%.*]] = extractelement <2 x i32> [[TMP119]], i32 1
+; CHECK-NEXT:    [[ADD55:%.*]] = add i32 [[TMP129]], [[TMP128]]
+; CHECK-NEXT:    [[SUB60:%.*]] = sub i32 [[TMP128]], [[TMP129]]
+; CHECK-NEXT:    [[SHR_I59:%.*]] = lshr i32 [[TMP121]], 15
+; CHECK-NEXT:    [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537
+; CHECK-NEXT:    [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535
+; CHECK-NEXT:    [[TMP130:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1
+; CHECK-NEXT:    [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32>
+; CHECK-NEXT:    [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1
+; CHECK-NEXT:    [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32>
+; CHECK-NEXT:    [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
+; CHECK-NEXT:    [[TMP136:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP137:%.*]] = zext <2 x i8> [[TMP136]] to <2 x i32>
+; CHECK-NEXT:    [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
+; CHECK-NEXT:    [[TMP139:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP140:%.*]] = zext <2 x i8> [[TMP139]] to <2 x i32>
+; CHECK-NEXT:    [[TMP141:%.*]] = sub <2 x i32> [[TMP137]], [[TMP140]]
+; CHECK-NEXT:    [[TMP142:%.*]] = shl <2 x i32> [[TMP141]], splat (i32 16)
+; CHECK-NEXT:    [[TMP143:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP144:%.*]] = zext <2 x i8> [[TMP143]] to <2 x i32>
+; CHECK-NEXT:    [[TMP145:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32>
+; CHECK-NEXT:    [[TMP147:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32>
+; CHECK-NEXT:    [[TMP149:%.*]] = sub <2 x i32> [[TMP146]], [[TMP148]]
+; CHECK-NEXT:    [[TMP150:%.*]] = shl <2 x i32> [[TMP149]], splat (i32 16)
+; CHECK-NEXT:    [[TMP151:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV33_1]], i32 1
+; CHECK-NEXT:    [[TMP225:%.*]] = sub <2 x i32> [[TMP151]], [[TMP144]]
+; CHECK-NEXT:    [[TMP153:%.*]] = add <2 x i32> [[TMP150]], [[TMP225]]
+; CHECK-NEXT:    [[TMP154:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT:    [[TMP155:%.*]] = sub <2 x i32> [[TMP154]], [[TMP134]]
+; CHECK-NEXT:    [[TMP156:%.*]] = add <2 x i32> [[TMP142]], [[TMP155]]
+; CHECK-NEXT:    [[TMP157:%.*]] = add <2 x i32> [[TMP153]], [[TMP156]]
+; CHECK-NEXT:    [[TMP158:%.*]] = sub <2 x i32> [[TMP156]], [[TMP153]]
+; CHECK-NEXT:    [[TMP159:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0
+; CHECK-NEXT:    [[TMP160:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1
+; CHECK-NEXT:    [[ADD48_1:%.*]] = add i32 [[TMP160]], [[TMP159]]
+; CHECK-NEXT:    [[SUB51_1:%.*]] = sub i32 [[TMP159]], [[TMP160]]
+; CHECK-NEXT:    [[TMP161:%.*]] = extractelement <2 x i32> [[TMP158]], i32 0
+; CHECK-NEXT:    [[TMP162:%.*]] = extractelement <2 x i32> [[TMP158]], i32 1
+; CHECK-NEXT:    [[ADD55_1:%.*]] = add i32 [[TMP162]], [[TMP161]]
+; CHECK-NEXT:    [[SUB59_2:%.*]] = sub i32 [[TMP161]], [[TMP162]]
+; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i32 [[TMP160]], 15
 ; CHECK-NEXT:    [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537
 ; CHECK-NEXT:    [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535
-; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15
+; CHECK-NEXT:    [[SHR_I54_1:%.*]] = lshr i32 [[TMP162]], 15
 ; CHECK-NEXT:    [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537
 ; CHECK-NEXT:    [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535
-; CHECK-NEXT:    [[TMP154:%.*]] = lshr <2 x i32> [[TMP110]], splat (i32 15)
-; CHECK-NEXT:    [[TMP184:%.*]] = and <2 x i32> [[TMP154]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP195:%.*]] = mul <2 x i32> [[TMP184]], splat (i32 65535)
-; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD55]]
-; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD55]], [[ADD48_1]]
+; CHECK-NEXT:    [[TMP163:%.*]] = lshr <2 x i32> [[TMP131]], splat (i32 15)
+; CHECK-NEXT:    [[TMP164:%.*]] = and <2 x i32> [[TMP163]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP165:%.*]] = mul <2 x i32> [[TMP164]], splat (i32 65535)
+; CHECK-NEXT:    [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]]
+; CHECK-NEXT:    [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]]
 ; CHECK-NEXT:    [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]]
 ; CHECK-NEXT:    [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]]
-; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]]
-; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]]
-; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]]
+; CHECK-NEXT:    [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]]
+; CHECK-NEXT:    [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]]
+; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
 ; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]]
-; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]]
-; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP34]]
+; CHECK-NEXT:    [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
+; CHECK-NEXT:    [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP197]]
 ; CHECK-NEXT:    [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
-; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP142]]
-; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]]
-; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP99]]
+; CHECK-NEXT:    [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]]
+; CHECK-NEXT:    [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
+; CHECK-NEXT:    [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP121]]
 ; CHECK-NEXT:    [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT:    [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
-; CHECK-NEXT:    [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
-; CHECK-NEXT:    [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
+; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
+; CHECK-NEXT:    [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
 ; CHECK-NEXT:    [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
-; CHECK-NEXT:    [[ADD103_2:%.*]] = add i32 [[ADD94_5]], [[ADD103_1]]
-; CHECK-NEXT:    [[SUB104_2:%.*]] = sub i32 [[ADD103_1]], [[ADD94_5]]
-; CHECK-NEXT:    [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB104_1]]
-; CHECK-NEXT:    [[SUB106_1:%.*]] = sub i32 [[SUB104_1]], [[SUB102_1]]
-; CHECK-NEXT:    [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]]
-; CHECK-NEXT:    [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP83]]
-; CHECK-NEXT:    [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]]
-; CHECK-NEXT:    [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]]
-; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_2]]
-; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]]
-; CHECK-NEXT:    [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]]
-; CHECK-NEXT:    [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP157]]
-; CHECK-NEXT:    [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]]
-; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
-; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
-; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]]
-; CHECK-NEXT:    [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]]
-; CHECK-NEXT:    [[TMP170:%.*]] = sub i32 [[SUB51]], [[SUB45_1]]
-; CHECK-NEXT:    [[TMP162:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0
-; CHECK-NEXT:    [[TMP163:%.*]] = shufflevector <2 x i32> [[TMP162]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
-; CHECK-NEXT:    [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP164]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP166:%.*]] = add <2 x i32> [[TMP163]], [[TMP165]]
-; CHECK-NEXT:    [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]]
-; CHECK-NEXT:    [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP166]], <2 x i32> [[TMP167]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[ADD105_2:%.*]] = add i32 [[TMP169]], [[TMP170]]
-; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP170]], [[TMP169]]
-; CHECK-NEXT:    [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]]
-; CHECK-NEXT:    [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]]
-; CHECK-NEXT:    [[TMP197:%.*]] = add <2 x i32> [[TMP195]], [[TMP168]]
-; CHECK-NEXT:    [[TMP152:%.*]] = xor <2 x i32> [[TMP197]], [[TMP110]]
-; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP111]], 15
-; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
-; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
-; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
-; CHECK-NEXT:    [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP111]]
-; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]]
-; CHECK-NEXT:    [[TMP175:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0
-; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP175]]
-; CHECK-NEXT:    [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1
-; CHECK-NEXT:    [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP176]]
-; CHECK-NEXT:    [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]]
-; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]]
-; CHECK-NEXT:    [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]]
+; CHECK-NEXT:    [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
+; CHECK-NEXT:    [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]]
+; CHECK-NEXT:    [[TMP166:%.*]] = extractelement <2 x i32> [[TMP126]], i32 0
+; CHECK-NEXT:    [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[TMP166]]
 ; CHECK-NEXT:    [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0
 ; CHECK-NEXT:    [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0
@@ -308,21 +260,83 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP181:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]]
 ; CHECK-NEXT:    [[TMP182:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]]
 ; CHECK-NEXT:    [[TMP183:%.*]] = shufflevector <2 x i32> [[TMP181]], <2 x i32> [[TMP182]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]]
-; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]]
+; CHECK-NEXT:    [[TMP174:%.*]] = insertelement <2 x i32> [[TMP119]], i32 [[CONV_1]], i32 0
+; CHECK-NEXT:    [[TMP175:%.*]] = lshr <2 x i32> [[TMP174]], splat (i32 15)
+; CHECK-NEXT:    [[TMP226:%.*]] = and <2 x i32> [[TMP175]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP227:%.*]] = mul <2 x i32> [[TMP226]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP222]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP229:%.*]] = insertelement <2 x i32> [[TMP228]], i32 [[ADD55]], i32 1
+; CHECK-NEXT:    [[TMP230:%.*]] = insertelement <2 x i32> [[TMP222]], i32 [[ADD55_1]], i32 1
+; CHECK-NEXT:    [[TMP231:%.*]] = sub <2 x i32> [[TMP229]], [[TMP230]]
+; CHECK-NEXT:    [[TMP232:%.*]] = insertelement <2 x i32> poison, i32 [[SUB51_1]], i32 0
+; CHECK-NEXT:    [[TMP233:%.*]] = insertelement <2 x i32> [[TMP232]], i32 [[ADD113_2]], i32 1
+; CHECK-NEXT:    [[TMP184:%.*]] = sub <2 x i32> [[TMP126]], [[TMP233]]
+; CHECK-NEXT:    [[TMP234:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[TMP235:%.*]] = zext <2 x i8> [[TMP234]] to <2 x i32>
+; CHECK-NEXT:    [[TMP236:%.*]] = lshr <2 x i32> [[TMP235]], splat (i32 15)
+; CHECK-NEXT:    [[TMP237:%.*]] = and <2 x i32> [[TMP236]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP238:%.*]] = mul <2 x i32> [[TMP237]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP239:%.*]] = shufflevector <2 x i32> [[TMP184]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP191:%.*]] = insertelement <2 x i32> [[TMP239]], i32 [[ADD94_1]], i32 1
+; CHECK-NEXT:    [[TMP192:%.*]] = shufflevector <2 x i32> [[TMP231]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP240:%.*]] = insertelement <2 x i32> [[TMP192]], i32 [[ADD78_1]], i32 1
+; CHECK-NEXT:    [[TMP194:%.*]] = add <2 x i32> [[TMP191]], [[TMP240]]
+; CHECK-NEXT:    [[TMP195:%.*]] = add <2 x i32> [[TMP238]], [[TMP194]]
+; CHECK-NEXT:    [[TMP196:%.*]] = xor <2 x i32> [[TMP195]], [[TMP235]]
+; CHECK-NEXT:    [[MUL_I51_4:%.*]] = extractelement <2 x i32> [[TMP196]], i32 0
 ; CHECK-NEXT:    [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]]
-; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]]
+; CHECK-NEXT:    [[TMP198:%.*]] = extractelement <2 x i32> [[TMP196]], i32 1
+; CHECK-NEXT:    [[ADD110_1:%.*]] = add i32 [[ADD_I52_3]], [[TMP198]]
+; CHECK-NEXT:    [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
+; CHECK-NEXT:    [[TMP199:%.*]] = add <2 x i32> [[TMP231]], [[TMP184]]
+; CHECK-NEXT:    [[TMP200:%.*]] = sub <2 x i32> [[TMP231]], [[TMP184]]
+; CHECK-NEXT:    [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP202:%.*]] = add <2 x i32> [[TMP227]], [[TMP201]]
+; CHECK-NEXT:    [[TMP203:%.*]] = xor <2 x i32> [[TMP202]], [[TMP174]]
+; CHECK-NEXT:    [[TMP204:%.*]] = extractelement <2 x i32> [[TMP203]], i32 1
+; CHECK-NEXT:    [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP204]]
+; CHECK-NEXT:    [[TMP205:%.*]] = extractelement <2 x i32> [[TMP231]], i32 0
+; CHECK-NEXT:    [[TMP206:%.*]] = extractelement <2 x i32> [[TMP184]], i32 0
+; CHECK-NEXT:    [[TMP207:%.*]] = shufflevector <2 x i32> [[TMP184]], <2 x i32> [[TMP231]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[SUB106_2:%.*]] = sub i32 [[TMP206]], [[TMP205]]
+; CHECK-NEXT:    [[TMP208:%.*]] = add <2 x i32> [[TMP165]], [[TMP183]]
+; CHECK-NEXT:    [[TMP209:%.*]] = xor <2 x i32> [[TMP208]], [[TMP131]]
+; CHECK-NEXT:    [[SHR_I59_2:%.*]] = lshr i32 [[TMP120]], 15
+; CHECK-NEXT:    [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537
+; CHECK-NEXT:    [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535
+; CHECK-NEXT:    [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]]
+; CHECK-NEXT:    [[XOR_I63_4:%.*]] = xor i32 [[ADD_I62_2]], [[TMP120]]
+; CHECK-NEXT:    [[TMP210:%.*]] = extractelement <2 x i32> [[TMP203]], i32 0
+; CHECK-NEXT:    [[ADD108_2:%.*]] = add i32 [[TMP210]], [[ADD113_1]]
+; CHECK-NEXT:    [[TMP211:%.*]] = extractelement <2 x i32> [[TMP209]], i32 0
+; CHECK-NEXT:    [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP211]]
+; CHECK-NEXT:    [[TMP212:%.*]] = extractelement <2 x i32> [[TMP209]], i32 1
+; CHECK-NEXT:    [[ADD112_4:%.*]] = add i32 [[ADD110_2]], [[TMP212]]
+; CHECK-NEXT:    [[ADD113_4:%.*]] = add i32 [[ADD112_4]], [[XOR_I63_4]]
+; CHECK-NEXT:    [[ADD78_4:%.*]] = add i32 [[SUB59_2]], [[SUB60]]
+; CHECK-NEXT:    [[SUB86_4:%.*]] = sub i32 [[SUB60]], [[SUB59_2]]
+; CHECK-NEXT:    [[TMP213:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_4]], i32 0
+; CHECK-NEXT:    [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP215:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0
+; CHECK-NEXT:    [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP217:%.*]] = add <2 x i32> [[TMP214]], [[TMP216]]
+; CHECK-NEXT:    [[TMP218:%.*]] = sub <2 x i32> [[TMP214]], [[TMP216]]
+; CHECK-NEXT:    [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP217]], <2 x i32> [[TMP218]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[ADD105_4:%.*]] = add i32 [[SUB102_3]], [[SUB86_4]]
+; CHECK-NEXT:    [[SUB106_3:%.*]] = sub i32 [[SUB86_4]], [[SUB102_3]]
+; CHECK-NEXT:    [[ADD_I52_4:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_4]]
+; CHECK-NEXT:    [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_4]], [[CONV1]]
 ; CHECK-NEXT:    [[TMP185:%.*]] = lshr <2 x i32> [[TMP102]], splat (i32 15)
 ; CHECK-NEXT:    [[TMP193:%.*]] = and <2 x i32> [[TMP185]], splat (i32 65537)
 ; CHECK-NEXT:    [[TMP186:%.*]] = mul <2 x i32> [[TMP193]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP183]]
+; CHECK-NEXT:    [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP219]]
 ; CHECK-NEXT:    [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]]
 ; CHECK-NEXT:    [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15
 ; CHECK-NEXT:    [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537
 ; CHECK-NEXT:    [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535
 ; CHECK-NEXT:    [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]]
 ; CHECK-NEXT:    [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]]
-; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]]
+; CHECK-NEXT:    [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_4]]
 ; CHECK-NEXT:    [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0
 ; CHECK-NEXT:    [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]]
 ; CHECK-NEXT:    [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
index 360b258f216c56..f875d45db61dde 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll
@@ -14,7 +14,7 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr undef, align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP0]], float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP6]])
 ; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll
index 371b23019498d1..afca39ad8938a3 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-cmp-swapped-pred-parent.ll
@@ -12,7 +12,8 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 0, i16 poison>, i16 [[CALL37]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL]], i32 5
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 3, i32 5>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, i16 [[CALL37]], i32 6
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CALL]], i32 7
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret void
 ;
@@ -43,7 +44,8 @@ define void @test1() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison, i16 0>, i16 [[CALL]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[CALL37]], i32 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 3, i32 4>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison>, i16 [[CALL]], i32 6
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[CALL37]], i32 7
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <8 x i16> [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
index 261ec2b3935d7e..40568f9c8a5092 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll
@@ -1,31 +1,56 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix AARCH64 %}
 
 define i1 @test(float %0, double %1) {
-; CHECK-LABEL: define i1 @test
-; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 poison, i32 0, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 1, i32 7>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP10]], i64 0)
-; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP11]], i64 0)
-; CHECK-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4)
-; CHECK-NEXT:    [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]]
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float>
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer
-; CHECK-NEXT:    [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]])
-; CHECK-NEXT:    ret i1 [[TMP22]]
+; X86-LABEL: define i1 @test
+; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
+; X86-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
+; X86-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
+; X86-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 poison, i32 0, i32 3, i32 3>
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 1, i32 7>
+; X86-NEXT:    [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
+; X86-NEXT:    [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP10]], i64 0)
+; X86-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP11]], i64 0)
+; X86-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4)
+; X86-NEXT:    [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]]
+; X86-NEXT:    [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]]
+; X86-NEXT:    [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float>
+; X86-NEXT:    [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer
+; X86-NEXT:    [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer
+; X86-NEXT:    [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]]
+; X86-NEXT:    [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]])
+; X86-NEXT:    ret i1 [[TMP22]]
+;
+; AARCH64-LABEL: define i1 @test
+; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) {
+; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP0]], i32 3
+; AARCH64-NEXT:    [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double>
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[TMP1]], i32 0
+; AARCH64-NEXT:    [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]]
+; AARCH64-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 poison, i32 0, i32 3, i32 3>
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> <double 0.000000e+00, double poison, double poison, double poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> <double poison, double poison, double poison, double 0.000000e+00>, <4 x i32> <i32 2, i32 0, i32 poison, i32 7>
+; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+; AARCH64-NEXT:    [[TMP11:%.*]] = fmul <4 x double> [[TMP8]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP12:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]]
+; AARCH64-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP11]], i64 0)
+; AARCH64-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> <double poison, double poison, double poison, double poison, double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP12]], i64 0)
+; AARCH64-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP14]], <2 x double> [[TMP6]], i64 4)
+; AARCH64-NEXT:    [[TMP16:%.*]] = fsub <8 x double> [[TMP13]], [[TMP15]]
+; AARCH64-NEXT:    [[TMP17:%.*]] = fmul <8 x double> [[TMP13]], [[TMP15]]
+; AARCH64-NEXT:    [[TMP18:%.*]] = shufflevector <8 x double> [[TMP16]], <8 x double> [[TMP17]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP19:%.*]] = fptrunc <8 x double> [[TMP18]] to <8 x float>
+; AARCH64-NEXT:    [[TMP20:%.*]] = fmul <8 x float> [[TMP19]], zeroinitializer
+; AARCH64-NEXT:    [[TMP21:%.*]] = fcmp oeq <8 x float> [[TMP20]], zeroinitializer
+; AARCH64-NEXT:    [[TMP22:%.*]] = freeze <8 x i1> [[TMP21]]
+; AARCH64-NEXT:    [[TMP23:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP22]])
+; AARCH64-NEXT:    ret i1 [[TMP23]]
 ;
   %3 = fpext float %0 to double
   %4 = fpext float 0.000000e+00 to double
diff --git a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
index a854c61db6d28e..a42c8f2c650aec 100644
--- a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
+++ b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
@@ -11,8 +11,8 @@ define i32 @test(i8 %0) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <2 x i8> zeroinitializer, [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load volatile i8, ptr null, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 poison, i32 poison, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> <i8 0, i8 0, i8 poison, i8 0, i8 0, i8 poison, i8 0, i8 0>, <8 x i32> <i32 8, i32 9, i32 0, i32 11, i32 12, i32 1, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i8> <i8 0, i8 0, i8 poison, i8 0, i8 0, i8 poison, i8 0, i8 0>, <8 x i8> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <8 x i8> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD:%.*]] = load i48, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i48> <i48 poison, i48 0, i48 0, i48 0>, i48 [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD]], i32 0
@@ -21,9 +21,9 @@ define i32 @test(i8 %0) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(21) null, align 2
 ; CHECK-NEXT:    [[TMP13:%.*]] = load volatile i8, ptr null, align 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 0, i8 0, i8 0>, <8 x i32> <i32 8, i32 poison, i32 10, i32 3, i32 4, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x i8> [[TMP16]], i8 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 0, i8 0, i8 0>, i8 [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 0, i8 0, i8 0, i8 0>, i8 [[TMP0]], i32 3
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x i8> [[TMP18]], i8 [[TMP13]], i32 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP17]], [[TMP19]]
diff --git a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
index 561182d5e4f49d..940ee5b95871d5 100644
--- a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll
@@ -1,30 +1,54 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s --check-prefix X86 %}
+; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s --check-prefix AARCH64 %}
 
 define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
-; CHECK-NEXT:    [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
-; CHECK-NEXT:    [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
-; CHECK-NEXT:    [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
-; CHECK-NEXT:    [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP12]], false
-; CHECK-NEXT:    ret i1 [[OP_RDX]]
+; X86-LABEL: @test(
+; X86-NEXT:  bb:
+; X86-NEXT:    [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
+; X86-NEXT:    [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
+; X86-NEXT:    [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
+; X86-NEXT:    [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
+; X86-NEXT:    [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
+; X86-NEXT:    [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
+; X86-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
+; X86-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
+; X86-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <8 x ptr> poison, ptr [[I250]], i32 0
+; X86-NEXT:    [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I242]], i32 1
+; X86-NEXT:    [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I245]], i32 2
+; X86-NEXT:    [[TMP8:%.*]] = insertelement <8 x ptr> [[TMP7]], ptr [[I248]], i32 3
+; X86-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP8]], <8 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
+; X86-NEXT:    [[TMP11:%.*]] = icmp ult <8 x ptr> [[TMP9]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]]
+; X86-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]])
+; X86-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP13]], false
+; X86-NEXT:    ret i1 [[OP_RDX]]
+;
+; AARCH64-LABEL: @test(
+; AARCH64-NEXT:  bb:
+; AARCH64-NEXT:    [[I226:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i32 7
+; AARCH64-NEXT:    [[I242:%.*]] = getelementptr double, ptr [[I233:%.*]], i64 [[I241:%.*]]
+; AARCH64-NEXT:    [[I245:%.*]] = getelementptr double, ptr [[I235:%.*]], i64 [[I241]]
+; AARCH64-NEXT:    [[I248:%.*]] = getelementptr double, ptr [[I237:%.*]], i64 [[I241]]
+; AARCH64-NEXT:    [[I250:%.*]] = getelementptr double, ptr [[I227:%.*]], i64 [[I241]]
+; AARCH64-NEXT:    [[TMP0:%.*]] = load <4 x ptr>, ptr [[I226]], align 8
+; AARCH64-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, ptr [[I242]], i32 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2
+; AARCH64-NEXT:    [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]]
+; AARCH64-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> <i32 2, i32 0, i32 poison, i32 poison>
+; AARCH64-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2
+; AARCH64-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> <ptr poison, ptr null, ptr poison, ptr null, ptr null, ptr null, ptr null, ptr null>, <8 x i32> <i32 1, i32 9, i32 0, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AARCH64-NEXT:    [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]]
+; AARCH64-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]]
+; AARCH64-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]])
+; AARCH64-NEXT:    [[OP_RDX:%.*]] = and i1 [[TMP12]], false
+; AARCH64-NEXT:    ret i1 [[OP_RDX]]
 ;
 bb:
   %i226 = getelementptr ptr, ptr %arg, i32 7
diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
index 61a84a67c9ff19..056b6222cae726 100644
--- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
+++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
@@ -13,9 +13,9 @@ define void @func(i32 %0) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP6]] to i64
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i32> [[TMP11]], <32 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, <32 x i32> <i32 poison, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <32 x i32> [[TMP12]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, i32 [[TMP11]], i32 30
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <32 x i32> [[TMP12]], <32 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 30, i32 30>
 ; CHECK-NEXT:    [[TMP14:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v8i32(<32 x i32> [[TMP13]], <8 x i32> zeroinitializer, i64 16)
 ; CHECK-NEXT:    [[TMP15:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v4i32(<32 x i32> [[TMP14]], <4 x i32> zeroinitializer, i64 24)
 ; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x i32> @llvm.vector.insert.v32i32.v2i32(<32 x i32> [[TMP15]], <2 x i32> zeroinitializer, i64 14)



More information about the llvm-commits mailing list