[llvm] 417fe52 - Revert "[SLP] Check with target before vectorizing GEP Indices."

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 28 00:07:18 PDT 2023


Author: Florian Hahn
Date: 2023-03-28T08:06:53+01:00
New Revision: 417fe52e6fb44c727eb4a63b6eb49377b692bdd8

URL: https://github.com/llvm/llvm-project/commit/417fe52e6fb44c727eb4a63b6eb49377b692bdd8
DIFF: https://github.com/llvm/llvm-project/commit/417fe52e6fb44c727eb4a63b6eb49377b692bdd8.diff

LOG: Revert "[SLP] Check with target before vectorizing GEP Indices."

This reverts commit 1387a13e1d0bac94457626ef3e7427c84caf6e65.

This introduced performance regressions on AArch64 in cases where the
cost of a vector GEP + extracts is offset by the benefits of vectorizing
the rest of the tree.

The test in llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
illustrates the issue. It was extracted from code that regressed a SPEC
benchmark by 15%.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
    llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
    llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
    llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
    llvm/test/Transforms/SLPVectorizer/X86/partail.ll

Removed: 
    llvm/test/Transforms/SLPVectorizer/SystemZ/gep-indices.ll


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ba8e04538d383..1d5da9aee18d8 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5250,9 +5250,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                           Depth](ArrayRef<Value *> VL) {
     if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
       return false;
-    if (S.getOpcode() == Instruction::GetElementPtr &&
-        !TTI->prefersVectorizedAddressing())
-      return true;
     if (VectorizableTree.size() < MinTreeSize)
       return false;
     if (Depth >= RecursionMaxDepth - 1)
@@ -12130,23 +12127,21 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
       if (!isValidElementType(SI->getValueOperand()->getType()))
         continue;
       Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
-      continue;
     }
 
     // Ignore getelementptr instructions that have more than one index, a
     // constant index, or a pointer operand that doesn't point to a scalar
     // type.
-    if (TTI->prefersVectorizedAddressing())
-      if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
-        auto Idx = GEP->idx_begin()->get();
-        if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
-          continue;
-        if (!isValidElementType(Idx->getType()))
-          continue;
-        if (GEP->getType()->isVectorTy())
-          continue;
-        GEPs[GEP->getPointerOperand()].push_back(GEP);
-      }
+    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+      auto Idx = GEP->idx_begin()->get();
+      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+        continue;
+      if (!isValidElementType(Idx->getType()))
+        continue;
+      if (GEP->getType()->isVectorTy())
+        continue;
+      GEPs[GEP->getPointerOperand()].push_back(GEP);
+    }
   }
 }
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index 2faec3047e44a..e3ad918498ee7 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -12,21 +12,18 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
-; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
-; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[S1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
-; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[S2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
-; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[S3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void
@@ -61,25 +58,23 @@ define void @test2(<4 x i16> %a, <4 x i16> %b, i64 %c0, i64 %c1, i64 %c2, i64 %c
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
-; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
-; CHECK-NEXT:    [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]]
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[A0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[C0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
-; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
-; CHECK-NEXT:    [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]]
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
-; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
-; CHECK-NEXT:    [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
-; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
-; CHECK-NEXT:    [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
index cf4aba2a61c18..de9859df3aad9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -36,92 +36,59 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 1
-; GENERIC-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_0101]], align 2
-; GENERIC-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 1
-; GENERIC-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2
-; GENERIC-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP1]] to i64
-; GENERIC-NEXT:    [[SUB:%.*]] = sub nsw i64 [[CONV]], [[CONV2]]
-; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[SUB]]
-; GENERIC-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP2]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
+; GENERIC-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; GENERIC-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
+; GENERIC-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; GENERIC-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; GENERIC-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
+; GENERIC-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP8]]
+; GENERIC-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
 ; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; GENERIC-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 2
-; GENERIC-NEXT:    [[TMP3:%.*]] = load i16, ptr [[INCDEC_PTR]], align 2
-; GENERIC-NEXT:    [[CONV5:%.*]] = zext i16 [[TMP3]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 2
-; GENERIC-NEXT:    [[TMP4:%.*]] = load i16, ptr [[INCDEC_PTR1]], align 2
-; GENERIC-NEXT:    [[CONV7:%.*]] = zext i16 [[TMP4]] to i64
-; GENERIC-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[CONV5]], [[CONV7]]
-; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB8]]
-; GENERIC-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
-; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP5]] to i32
+; GENERIC-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
+; GENERIC-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP11]]
+; GENERIC-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
 ; GENERIC-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; GENERIC-NEXT:    [[INCDEC_PTR13:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 3
-; GENERIC-NEXT:    [[TMP6:%.*]] = load i16, ptr [[INCDEC_PTR4]], align 2
-; GENERIC-NEXT:    [[CONV14:%.*]] = zext i16 [[TMP6]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR15:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 3
-; GENERIC-NEXT:    [[TMP7:%.*]] = load i16, ptr [[INCDEC_PTR6]], align 2
-; GENERIC-NEXT:    [[CONV16:%.*]] = zext i16 [[TMP7]] to i64
-; GENERIC-NEXT:    [[SUB17:%.*]] = sub nsw i64 [[CONV14]], [[CONV16]]
-; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB17]]
-; GENERIC-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
-; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP8]] to i32
+; GENERIC-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
+; GENERIC-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP14]]
+; GENERIC-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
+; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
 ; GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; GENERIC-NEXT:    [[INCDEC_PTR22:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 4
-; GENERIC-NEXT:    [[TMP9:%.*]] = load i16, ptr [[INCDEC_PTR13]], align 2
-; GENERIC-NEXT:    [[CONV23:%.*]] = zext i16 [[TMP9]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR24:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 4
-; GENERIC-NEXT:    [[TMP10:%.*]] = load i16, ptr [[INCDEC_PTR15]], align 2
-; GENERIC-NEXT:    [[CONV25:%.*]] = zext i16 [[TMP10]] to i64
-; GENERIC-NEXT:    [[SUB26:%.*]] = sub nsw i64 [[CONV23]], [[CONV25]]
-; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB26]]
-; GENERIC-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
-; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP11]] to i32
+; GENERIC-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
+; GENERIC-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP17]]
+; GENERIC-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
+; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
 ; GENERIC-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; GENERIC-NEXT:    [[INCDEC_PTR31:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 5
-; GENERIC-NEXT:    [[TMP12:%.*]] = load i16, ptr [[INCDEC_PTR22]], align 2
-; GENERIC-NEXT:    [[CONV32:%.*]] = zext i16 [[TMP12]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR33:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 5
-; GENERIC-NEXT:    [[TMP13:%.*]] = load i16, ptr [[INCDEC_PTR24]], align 2
-; GENERIC-NEXT:    [[CONV34:%.*]] = zext i16 [[TMP13]] to i64
-; GENERIC-NEXT:    [[SUB35:%.*]] = sub nsw i64 [[CONV32]], [[CONV34]]
-; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB35]]
-; GENERIC-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
-; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP14]] to i32
+; GENERIC-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
+; GENERIC-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP20]]
+; GENERIC-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
+; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
 ; GENERIC-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; GENERIC-NEXT:    [[INCDEC_PTR40:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 6
-; GENERIC-NEXT:    [[TMP15:%.*]] = load i16, ptr [[INCDEC_PTR31]], align 2
-; GENERIC-NEXT:    [[CONV41:%.*]] = zext i16 [[TMP15]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR42:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 6
-; GENERIC-NEXT:    [[TMP16:%.*]] = load i16, ptr [[INCDEC_PTR33]], align 2
-; GENERIC-NEXT:    [[CONV43:%.*]] = zext i16 [[TMP16]] to i64
-; GENERIC-NEXT:    [[SUB44:%.*]] = sub nsw i64 [[CONV41]], [[CONV43]]
-; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB44]]
-; GENERIC-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
-; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP17]] to i32
+; GENERIC-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
+; GENERIC-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP23]]
+; GENERIC-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
+; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
 ; GENERIC-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; GENERIC-NEXT:    [[INCDEC_PTR49:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 7
-; GENERIC-NEXT:    [[TMP18:%.*]] = load i16, ptr [[INCDEC_PTR40]], align 2
-; GENERIC-NEXT:    [[CONV50:%.*]] = zext i16 [[TMP18]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR51:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 7
-; GENERIC-NEXT:    [[TMP19:%.*]] = load i16, ptr [[INCDEC_PTR42]], align 2
-; GENERIC-NEXT:    [[CONV52:%.*]] = zext i16 [[TMP19]] to i64
-; GENERIC-NEXT:    [[SUB53:%.*]] = sub nsw i64 [[CONV50]], [[CONV52]]
-; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB53]]
-; GENERIC-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
-; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP20]] to i32
+; GENERIC-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
+; GENERIC-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP26]]
+; GENERIC-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
+; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
-; GENERIC-NEXT:    [[TMP21:%.*]] = load i16, ptr [[INCDEC_PTR49]], align 2
-; GENERIC-NEXT:    [[CONV59:%.*]] = zext i16 [[TMP21]] to i64
-; GENERIC-NEXT:    [[TMP22:%.*]] = load i16, ptr [[INCDEC_PTR51]], align 2
-; GENERIC-NEXT:    [[CONV61:%.*]] = zext i16 [[TMP22]] to i64
-; GENERIC-NEXT:    [[SUB62:%.*]] = sub nsw i64 [[CONV59]], [[CONV61]]
-; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB62]]
-; GENERIC-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
-; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP23]] to i32
+; GENERIC-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
+; GENERIC-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP29]]
+; GENERIC-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
+; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
 ; GENERIC-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
 ; GENERIC-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
 ; GENERIC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -142,92 +109,59 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 1
-; KRYO-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_0101]], align 2
-; KRYO-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 1
-; KRYO-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2
-; KRYO-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP1]] to i64
-; KRYO-NEXT:    [[SUB:%.*]] = sub nsw i64 [[CONV]], [[CONV2]]
-; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[SUB]]
-; KRYO-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP2]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; KRYO-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
+; KRYO-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; KRYO-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
+; KRYO-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; KRYO-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; KRYO-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
+; KRYO-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP8]]
+; KRYO-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
 ; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; KRYO-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 2
-; KRYO-NEXT:    [[TMP3:%.*]] = load i16, ptr [[INCDEC_PTR]], align 2
-; KRYO-NEXT:    [[CONV5:%.*]] = zext i16 [[TMP3]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 2
-; KRYO-NEXT:    [[TMP4:%.*]] = load i16, ptr [[INCDEC_PTR1]], align 2
-; KRYO-NEXT:    [[CONV7:%.*]] = zext i16 [[TMP4]] to i64
-; KRYO-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[CONV5]], [[CONV7]]
-; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB8]]
-; KRYO-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
-; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP5]] to i32
+; KRYO-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
+; KRYO-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP11]]
+; KRYO-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
 ; KRYO-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; KRYO-NEXT:    [[INCDEC_PTR13:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 3
-; KRYO-NEXT:    [[TMP6:%.*]] = load i16, ptr [[INCDEC_PTR4]], align 2
-; KRYO-NEXT:    [[CONV14:%.*]] = zext i16 [[TMP6]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR15:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 3
-; KRYO-NEXT:    [[TMP7:%.*]] = load i16, ptr [[INCDEC_PTR6]], align 2
-; KRYO-NEXT:    [[CONV16:%.*]] = zext i16 [[TMP7]] to i64
-; KRYO-NEXT:    [[SUB17:%.*]] = sub nsw i64 [[CONV14]], [[CONV16]]
-; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB17]]
-; KRYO-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
-; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP8]] to i32
+; KRYO-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
+; KRYO-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP14]]
+; KRYO-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
+; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
 ; KRYO-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; KRYO-NEXT:    [[INCDEC_PTR22:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 4
-; KRYO-NEXT:    [[TMP9:%.*]] = load i16, ptr [[INCDEC_PTR13]], align 2
-; KRYO-NEXT:    [[CONV23:%.*]] = zext i16 [[TMP9]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR24:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 4
-; KRYO-NEXT:    [[TMP10:%.*]] = load i16, ptr [[INCDEC_PTR15]], align 2
-; KRYO-NEXT:    [[CONV25:%.*]] = zext i16 [[TMP10]] to i64
-; KRYO-NEXT:    [[SUB26:%.*]] = sub nsw i64 [[CONV23]], [[CONV25]]
-; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB26]]
-; KRYO-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
-; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP11]] to i32
+; KRYO-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
+; KRYO-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP17]]
+; KRYO-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
+; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
 ; KRYO-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; KRYO-NEXT:    [[INCDEC_PTR31:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 5
-; KRYO-NEXT:    [[TMP12:%.*]] = load i16, ptr [[INCDEC_PTR22]], align 2
-; KRYO-NEXT:    [[CONV32:%.*]] = zext i16 [[TMP12]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR33:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 5
-; KRYO-NEXT:    [[TMP13:%.*]] = load i16, ptr [[INCDEC_PTR24]], align 2
-; KRYO-NEXT:    [[CONV34:%.*]] = zext i16 [[TMP13]] to i64
-; KRYO-NEXT:    [[SUB35:%.*]] = sub nsw i64 [[CONV32]], [[CONV34]]
-; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB35]]
-; KRYO-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
-; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP14]] to i32
+; KRYO-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
+; KRYO-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP20]]
+; KRYO-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
+; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
 ; KRYO-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; KRYO-NEXT:    [[INCDEC_PTR40:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 6
-; KRYO-NEXT:    [[TMP15:%.*]] = load i16, ptr [[INCDEC_PTR31]], align 2
-; KRYO-NEXT:    [[CONV41:%.*]] = zext i16 [[TMP15]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR42:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 6
-; KRYO-NEXT:    [[TMP16:%.*]] = load i16, ptr [[INCDEC_PTR33]], align 2
-; KRYO-NEXT:    [[CONV43:%.*]] = zext i16 [[TMP16]] to i64
-; KRYO-NEXT:    [[SUB44:%.*]] = sub nsw i64 [[CONV41]], [[CONV43]]
-; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB44]]
-; KRYO-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
-; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP17]] to i32
+; KRYO-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
+; KRYO-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP23]]
+; KRYO-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
+; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
 ; KRYO-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; KRYO-NEXT:    [[INCDEC_PTR49:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 7
-; KRYO-NEXT:    [[TMP18:%.*]] = load i16, ptr [[INCDEC_PTR40]], align 2
-; KRYO-NEXT:    [[CONV50:%.*]] = zext i16 [[TMP18]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR51:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 7
-; KRYO-NEXT:    [[TMP19:%.*]] = load i16, ptr [[INCDEC_PTR42]], align 2
-; KRYO-NEXT:    [[CONV52:%.*]] = zext i16 [[TMP19]] to i64
-; KRYO-NEXT:    [[SUB53:%.*]] = sub nsw i64 [[CONV50]], [[CONV52]]
-; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB53]]
-; KRYO-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
-; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP20]] to i32
+; KRYO-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
+; KRYO-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP26]]
+; KRYO-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
+; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
-; KRYO-NEXT:    [[TMP21:%.*]] = load i16, ptr [[INCDEC_PTR49]], align 2
-; KRYO-NEXT:    [[CONV59:%.*]] = zext i16 [[TMP21]] to i64
-; KRYO-NEXT:    [[TMP22:%.*]] = load i16, ptr [[INCDEC_PTR51]], align 2
-; KRYO-NEXT:    [[CONV61:%.*]] = zext i16 [[TMP22]] to i64
-; KRYO-NEXT:    [[SUB62:%.*]] = sub nsw i64 [[CONV59]], [[CONV61]]
-; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB62]]
-; KRYO-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
-; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP23]] to i32
+; KRYO-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
+; KRYO-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP29]]
+; KRYO-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
+; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
 ; KRYO-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
 ; KRYO-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
 ; KRYO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -359,92 +293,59 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 1
-; GENERIC-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_0101]], align 2
-; GENERIC-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 1
-; GENERIC-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2
-; GENERIC-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP1]] to i64
-; GENERIC-NEXT:    [[SUB:%.*]] = sub nsw i64 [[CONV]], [[CONV2]]
-; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[SUB]]
-; GENERIC-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP2]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
+; GENERIC-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; GENERIC-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
+; GENERIC-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; GENERIC-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; GENERIC-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
+; GENERIC-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP8]]
+; GENERIC-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
 ; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; GENERIC-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 2
-; GENERIC-NEXT:    [[TMP3:%.*]] = load i16, ptr [[INCDEC_PTR]], align 2
-; GENERIC-NEXT:    [[CONV5:%.*]] = zext i16 [[TMP3]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 2
-; GENERIC-NEXT:    [[TMP4:%.*]] = load i16, ptr [[INCDEC_PTR1]], align 2
-; GENERIC-NEXT:    [[CONV7:%.*]] = zext i16 [[TMP4]] to i64
-; GENERIC-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[CONV5]], [[CONV7]]
-; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB8]]
-; GENERIC-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
-; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP5]] to i32
+; GENERIC-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
+; GENERIC-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP11]]
+; GENERIC-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
 ; GENERIC-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; GENERIC-NEXT:    [[INCDEC_PTR13:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 3
-; GENERIC-NEXT:    [[TMP6:%.*]] = load i16, ptr [[INCDEC_PTR4]], align 2
-; GENERIC-NEXT:    [[CONV14:%.*]] = zext i16 [[TMP6]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR15:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 3
-; GENERIC-NEXT:    [[TMP7:%.*]] = load i16, ptr [[INCDEC_PTR6]], align 2
-; GENERIC-NEXT:    [[CONV16:%.*]] = zext i16 [[TMP7]] to i64
-; GENERIC-NEXT:    [[SUB17:%.*]] = sub nsw i64 [[CONV14]], [[CONV16]]
-; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB17]]
-; GENERIC-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
-; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP8]] to i32
+; GENERIC-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
+; GENERIC-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP14]]
+; GENERIC-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
+; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
 ; GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; GENERIC-NEXT:    [[INCDEC_PTR22:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 4
-; GENERIC-NEXT:    [[TMP9:%.*]] = load i16, ptr [[INCDEC_PTR13]], align 2
-; GENERIC-NEXT:    [[CONV23:%.*]] = zext i16 [[TMP9]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR24:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 4
-; GENERIC-NEXT:    [[TMP10:%.*]] = load i16, ptr [[INCDEC_PTR15]], align 2
-; GENERIC-NEXT:    [[CONV25:%.*]] = zext i16 [[TMP10]] to i64
-; GENERIC-NEXT:    [[SUB26:%.*]] = sub nsw i64 [[CONV23]], [[CONV25]]
-; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB26]]
-; GENERIC-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
-; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP11]] to i32
+; GENERIC-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
+; GENERIC-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP17]]
+; GENERIC-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
+; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
 ; GENERIC-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; GENERIC-NEXT:    [[INCDEC_PTR31:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 5
-; GENERIC-NEXT:    [[TMP12:%.*]] = load i16, ptr [[INCDEC_PTR22]], align 2
-; GENERIC-NEXT:    [[CONV32:%.*]] = zext i16 [[TMP12]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR33:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 5
-; GENERIC-NEXT:    [[TMP13:%.*]] = load i16, ptr [[INCDEC_PTR24]], align 2
-; GENERIC-NEXT:    [[CONV34:%.*]] = zext i16 [[TMP13]] to i64
-; GENERIC-NEXT:    [[SUB35:%.*]] = sub nsw i64 [[CONV32]], [[CONV34]]
-; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB35]]
-; GENERIC-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
-; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP14]] to i32
+; GENERIC-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
+; GENERIC-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP20]]
+; GENERIC-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
+; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
 ; GENERIC-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; GENERIC-NEXT:    [[INCDEC_PTR40:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 6
-; GENERIC-NEXT:    [[TMP15:%.*]] = load i16, ptr [[INCDEC_PTR31]], align 2
-; GENERIC-NEXT:    [[CONV41:%.*]] = zext i16 [[TMP15]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR42:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 6
-; GENERIC-NEXT:    [[TMP16:%.*]] = load i16, ptr [[INCDEC_PTR33]], align 2
-; GENERIC-NEXT:    [[CONV43:%.*]] = zext i16 [[TMP16]] to i64
-; GENERIC-NEXT:    [[SUB44:%.*]] = sub nsw i64 [[CONV41]], [[CONV43]]
-; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB44]]
-; GENERIC-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
-; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP17]] to i32
+; GENERIC-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
+; GENERIC-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP23]]
+; GENERIC-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
+; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
 ; GENERIC-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; GENERIC-NEXT:    [[INCDEC_PTR49:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 7
-; GENERIC-NEXT:    [[TMP18:%.*]] = load i16, ptr [[INCDEC_PTR40]], align 2
-; GENERIC-NEXT:    [[CONV50:%.*]] = zext i16 [[TMP18]] to i64
-; GENERIC-NEXT:    [[INCDEC_PTR51:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 7
-; GENERIC-NEXT:    [[TMP19:%.*]] = load i16, ptr [[INCDEC_PTR42]], align 2
-; GENERIC-NEXT:    [[CONV52:%.*]] = zext i16 [[TMP19]] to i64
-; GENERIC-NEXT:    [[SUB53:%.*]] = sub nsw i64 [[CONV50]], [[CONV52]]
-; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB53]]
-; GENERIC-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
-; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP20]] to i32
+; GENERIC-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
+; GENERIC-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP26]]
+; GENERIC-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
+; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
-; GENERIC-NEXT:    [[TMP21:%.*]] = load i16, ptr [[INCDEC_PTR49]], align 2
-; GENERIC-NEXT:    [[CONV59:%.*]] = zext i16 [[TMP21]] to i64
-; GENERIC-NEXT:    [[TMP22:%.*]] = load i16, ptr [[INCDEC_PTR51]], align 2
-; GENERIC-NEXT:    [[CONV61:%.*]] = zext i16 [[TMP22]] to i64
-; GENERIC-NEXT:    [[SUB62:%.*]] = sub nsw i64 [[CONV59]], [[CONV61]]
-; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB62]]
-; GENERIC-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
-; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP23]] to i32
+; GENERIC-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
+; GENERIC-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP29]]
+; GENERIC-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
+; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
 ; GENERIC-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
 ; GENERIC-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
 ; GENERIC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -465,92 +366,59 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 1
-; KRYO-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_0101]], align 2
-; KRYO-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 1
-; KRYO-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2
-; KRYO-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP1]] to i64
-; KRYO-NEXT:    [[SUB:%.*]] = sub nsw i64 [[CONV]], [[CONV2]]
-; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[SUB]]
-; KRYO-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP2]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; KRYO-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
+; KRYO-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; KRYO-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
+; KRYO-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; KRYO-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; KRYO-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
+; KRYO-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP8]]
+; KRYO-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
 ; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; KRYO-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 2
-; KRYO-NEXT:    [[TMP3:%.*]] = load i16, ptr [[INCDEC_PTR]], align 2
-; KRYO-NEXT:    [[CONV5:%.*]] = zext i16 [[TMP3]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 2
-; KRYO-NEXT:    [[TMP4:%.*]] = load i16, ptr [[INCDEC_PTR1]], align 2
-; KRYO-NEXT:    [[CONV7:%.*]] = zext i16 [[TMP4]] to i64
-; KRYO-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[CONV5]], [[CONV7]]
-; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB8]]
-; KRYO-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
-; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP5]] to i32
+; KRYO-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
+; KRYO-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP11]]
+; KRYO-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
 ; KRYO-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; KRYO-NEXT:    [[INCDEC_PTR13:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 3
-; KRYO-NEXT:    [[TMP6:%.*]] = load i16, ptr [[INCDEC_PTR4]], align 2
-; KRYO-NEXT:    [[CONV14:%.*]] = zext i16 [[TMP6]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR15:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 3
-; KRYO-NEXT:    [[TMP7:%.*]] = load i16, ptr [[INCDEC_PTR6]], align 2
-; KRYO-NEXT:    [[CONV16:%.*]] = zext i16 [[TMP7]] to i64
-; KRYO-NEXT:    [[SUB17:%.*]] = sub nsw i64 [[CONV14]], [[CONV16]]
-; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB17]]
-; KRYO-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
-; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP8]] to i32
+; KRYO-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
+; KRYO-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP14]]
+; KRYO-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
+; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
 ; KRYO-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; KRYO-NEXT:    [[INCDEC_PTR22:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 4
-; KRYO-NEXT:    [[TMP9:%.*]] = load i16, ptr [[INCDEC_PTR13]], align 2
-; KRYO-NEXT:    [[CONV23:%.*]] = zext i16 [[TMP9]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR24:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 4
-; KRYO-NEXT:    [[TMP10:%.*]] = load i16, ptr [[INCDEC_PTR15]], align 2
-; KRYO-NEXT:    [[CONV25:%.*]] = zext i16 [[TMP10]] to i64
-; KRYO-NEXT:    [[SUB26:%.*]] = sub nsw i64 [[CONV23]], [[CONV25]]
-; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB26]]
-; KRYO-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
-; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP11]] to i32
+; KRYO-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
+; KRYO-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP17]]
+; KRYO-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
+; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
 ; KRYO-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; KRYO-NEXT:    [[INCDEC_PTR31:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 5
-; KRYO-NEXT:    [[TMP12:%.*]] = load i16, ptr [[INCDEC_PTR22]], align 2
-; KRYO-NEXT:    [[CONV32:%.*]] = zext i16 [[TMP12]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR33:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 5
-; KRYO-NEXT:    [[TMP13:%.*]] = load i16, ptr [[INCDEC_PTR24]], align 2
-; KRYO-NEXT:    [[CONV34:%.*]] = zext i16 [[TMP13]] to i64
-; KRYO-NEXT:    [[SUB35:%.*]] = sub nsw i64 [[CONV32]], [[CONV34]]
-; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB35]]
-; KRYO-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
-; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP14]] to i32
+; KRYO-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
+; KRYO-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP20]]
+; KRYO-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
+; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
 ; KRYO-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; KRYO-NEXT:    [[INCDEC_PTR40:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 6
-; KRYO-NEXT:    [[TMP15:%.*]] = load i16, ptr [[INCDEC_PTR31]], align 2
-; KRYO-NEXT:    [[CONV41:%.*]] = zext i16 [[TMP15]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR42:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 6
-; KRYO-NEXT:    [[TMP16:%.*]] = load i16, ptr [[INCDEC_PTR33]], align 2
-; KRYO-NEXT:    [[CONV43:%.*]] = zext i16 [[TMP16]] to i64
-; KRYO-NEXT:    [[SUB44:%.*]] = sub nsw i64 [[CONV41]], [[CONV43]]
-; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB44]]
-; KRYO-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
-; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP17]] to i32
+; KRYO-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
+; KRYO-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP23]]
+; KRYO-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
+; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
 ; KRYO-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; KRYO-NEXT:    [[INCDEC_PTR49:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 7
-; KRYO-NEXT:    [[TMP18:%.*]] = load i16, ptr [[INCDEC_PTR40]], align 2
-; KRYO-NEXT:    [[CONV50:%.*]] = zext i16 [[TMP18]] to i64
-; KRYO-NEXT:    [[INCDEC_PTR51:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 7
-; KRYO-NEXT:    [[TMP19:%.*]] = load i16, ptr [[INCDEC_PTR42]], align 2
-; KRYO-NEXT:    [[CONV52:%.*]] = zext i16 [[TMP19]] to i64
-; KRYO-NEXT:    [[SUB53:%.*]] = sub nsw i64 [[CONV50]], [[CONV52]]
-; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB53]]
-; KRYO-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
-; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP20]] to i32
+; KRYO-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
+; KRYO-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP26]]
+; KRYO-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
+; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
-; KRYO-NEXT:    [[TMP21:%.*]] = load i16, ptr [[INCDEC_PTR49]], align 2
-; KRYO-NEXT:    [[CONV59:%.*]] = zext i16 [[TMP21]] to i64
-; KRYO-NEXT:    [[TMP22:%.*]] = load i16, ptr [[INCDEC_PTR51]], align 2
-; KRYO-NEXT:    [[CONV61:%.*]] = zext i16 [[TMP22]] to i64
-; KRYO-NEXT:    [[SUB62:%.*]] = sub nsw i64 [[CONV59]], [[CONV61]]
-; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB62]]
-; KRYO-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
-; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP23]] to i32
+; KRYO-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
+; KRYO-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP29]]
+; KRYO-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
+; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
 ; KRYO-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
 ; KRYO-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
 ; KRYO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index 5a4d7086c9b1b..a567b6c71e898 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -24,25 +24,25 @@ target triple = "aarch64--linux-gnu"
 ;
 
 ; YAML-LABEL: Function:        getelementptr_4x32
-; YAML:      --- !Missed
+; YAML:      --- !Passed
 ; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            NotBeneficial
+; YAML-NEXT: Name:            VectorizedList
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'List vectorization was possible but not beneficial with cost '
-; YAML-NEXT:   - Cost:            '-7'
-; YAML-NEXT:   - String:          ' >= '
-; YAML-NEXT:   - Treshold:        '7'
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '6'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '3'
 
-; YAML:      --- !Missed
+; YAML:      --- !Passed
 ; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            NotBeneficial
+; YAML-NEXT: Name:            VectorizedList
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'List vectorization was possible but not beneficial with cost '
-; YAML-NEXT:   - Cost:            '-7'
-; YAML-NEXT:   - String:          ' >= '
-; YAML-NEXT:   - Treshold:        '7'
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '6'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '3'
 
 define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @getelementptr_4x32(
@@ -50,6 +50,9 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i64 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -60,23 +63,28 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[T4]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
-; CHECK-NEXT:    [[T7:%.*]] = add nsw i32 [[T4]], [[X:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[T7]] to i64
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
-; CHECK-NEXT:    [[T9:%.*]] = add nsw i32 [[T4]], [[Y:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[T9]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0
+; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
-; CHECK-NEXT:    [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[T11]] to i64
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
 ; CHECK-NEXT:    [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
@@ -123,15 +131,15 @@ for.body:
 }
 
 ; YAML-LABEL: Function:        getelementptr_2x32
-; YAML:      --- !Missed
+; YAML:      --- !Passed
 ; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            NotBeneficial
+; YAML-NEXT: Name:            VectorizedList
 ; YAML-NEXT: Function:        getelementptr_2x32
 ; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'List vectorization was possible but not beneficial with cost '
-; YAML-NEXT:   - Cost:            '-7'
-; YAML-NEXT:   - String:          ' >= '
-; YAML-NEXT:   - Treshold:        '7'
+; YAML-NEXT:   - String:          'SLP vectorized with cost '
+; YAML-NEXT:   - Cost:            '6'
+; YAML-NEXT:   - String:          ' and with tree size '
+; YAML-NEXT:   - TreeSize:        '3'
 
 define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @getelementptr_2x32(
@@ -139,6 +147,8 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Z:%.*]], i64 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -149,23 +159,26 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[T4]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[T4]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
 ; CHECK-NEXT:    [[T7:%.*]] = or i32 [[T4]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[T7]] to i64
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[T7]] to i64
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
-; CHECK-NEXT:    [[T9:%.*]] = add nsw i32 [[T4]], [[Y:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[T9]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
-; CHECK-NEXT:    [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[T11]] to i64
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
 ; CHECK-NEXT:    [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
@@ -219,84 +232,44 @@ define void @test_i16_extend(ptr %p.1, ptr %p.2, i32 %idx.i32) {
 ; CHECK-LABEL: @test_i16_extend(
 ; CHECK-NEXT:    [[P_0:%.*]] = load ptr, ptr @global, align 8
 ; CHECK-NEXT:    [[IDX_0:%.*]] = zext i32 [[IDX_I32:%.*]] to i64
-; CHECK-NEXT:    [[IDX_1:%.*]] = add nuw nsw i64 [[IDX_0]], 1
-; CHECK-NEXT:    [[IDX_2:%.*]] = add nuw nsw i64 [[IDX_0]], 2
-; CHECK-NEXT:    [[IDX_3:%.*]] = add nuw nsw i64 [[IDX_0]], 3
-; CHECK-NEXT:    [[IDX_4:%.*]] = add nuw nsw i64 [[IDX_0]], 4
-; CHECK-NEXT:    [[IDX_5:%.*]] = add nuw nsw i64 [[IDX_0]], 5
-; CHECK-NEXT:    [[IDX_6:%.*]] = add nuw nsw i64 [[IDX_0]], 6
-; CHECK-NEXT:    [[IDX_7:%.*]] = add nuw nsw i64 [[IDX_0]], 7
 ; CHECK-NEXT:    [[T53:%.*]] = getelementptr inbounds i16, ptr [[P_1:%.*]], i64 [[IDX_0]]
-; CHECK-NEXT:    [[OP1_L:%.*]] = load i16, ptr [[T53]], align 2
-; CHECK-NEXT:    [[OP1_EXT:%.*]] = zext i16 [[OP1_L]] to i64
 ; CHECK-NEXT:    [[T56:%.*]] = getelementptr inbounds i16, ptr [[P_2:%.*]], i64 [[IDX_0]]
-; CHECK-NEXT:    [[OP2_L:%.*]] = load i16, ptr [[T56]], align 2
-; CHECK-NEXT:    [[OP2_EXT:%.*]] = zext i16 [[OP2_L]] to i64
-; CHECK-NEXT:    [[SUB_1:%.*]] = sub nsw i64 [[OP1_EXT]], [[OP2_EXT]]
-; CHECK-NEXT:    [[T60:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[T53]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr [[T56]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i16> [[TMP5]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP7]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
+; CHECK-NEXT:    [[T60:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[T60]], align 4
-; CHECK-NEXT:    [[T64:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_1]]
-; CHECK-NEXT:    [[T65:%.*]] = load i16, ptr [[T64]], align 2
-; CHECK-NEXT:    [[T66:%.*]] = zext i16 [[T65]] to i64
-; CHECK-NEXT:    [[T67:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_1]]
-; CHECK-NEXT:    [[T68:%.*]] = load i16, ptr [[T67]], align 2
-; CHECK-NEXT:    [[T69:%.*]] = zext i16 [[T68]] to i64
-; CHECK-NEXT:    [[SUB_2:%.*]] = sub nsw i64 [[T66]], [[T69]]
-; CHECK-NEXT:    [[T71:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP7]], i64 1
+; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; CHECK-NEXT:    [[T71:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[T71]], align 4
-; CHECK-NEXT:    [[T75:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_2]]
-; CHECK-NEXT:    [[T76:%.*]] = load i16, ptr [[T75]], align 2
-; CHECK-NEXT:    [[T77:%.*]] = zext i16 [[T76]] to i64
-; CHECK-NEXT:    [[T78:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_2]]
-; CHECK-NEXT:    [[T79:%.*]] = load i16, ptr [[T78]], align 2
-; CHECK-NEXT:    [[T80:%.*]] = zext i16 [[T79]] to i64
-; CHECK-NEXT:    [[SUB_3:%.*]] = sub nsw i64 [[T77]], [[T80]]
-; CHECK-NEXT:    [[T82:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[TMP7]], i64 2
+; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[T82:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[L_3:%.*]] = load i32, ptr [[T82]], align 4
-; CHECK-NEXT:    [[T86:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_3]]
-; CHECK-NEXT:    [[T87:%.*]] = load i16, ptr [[T86]], align 2
-; CHECK-NEXT:    [[T88:%.*]] = zext i16 [[T87]] to i64
-; CHECK-NEXT:    [[T89:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_3]]
-; CHECK-NEXT:    [[T90:%.*]] = load i16, ptr [[T89]], align 2
-; CHECK-NEXT:    [[T91:%.*]] = zext i16 [[T90]] to i64
-; CHECK-NEXT:    [[SUB_4:%.*]] = sub nsw i64 [[T88]], [[T91]]
-; CHECK-NEXT:    [[T93:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_4]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP7]], i64 3
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
+; CHECK-NEXT:    [[T93:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[L_4:%.*]] = load i32, ptr [[T93]], align 4
-; CHECK-NEXT:    [[T97:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_4]]
-; CHECK-NEXT:    [[T98:%.*]] = load i16, ptr [[T97]], align 2
-; CHECK-NEXT:    [[T99:%.*]] = zext i16 [[T98]] to i64
-; CHECK-NEXT:    [[T100:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_4]]
-; CHECK-NEXT:    [[T101:%.*]] = load i16, ptr [[T100]], align 2
-; CHECK-NEXT:    [[T102:%.*]] = zext i16 [[T101]] to i64
-; CHECK-NEXT:    [[SUB_5:%.*]] = sub nsw i64 [[T99]], [[T102]]
-; CHECK-NEXT:    [[T104:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP7]], i64 4
+; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-NEXT:    [[T104:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[L_5:%.*]] = load i32, ptr [[T104]], align 4
-; CHECK-NEXT:    [[T108:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_5]]
-; CHECK-NEXT:    [[T109:%.*]] = load i16, ptr [[T108]], align 2
-; CHECK-NEXT:    [[T110:%.*]] = zext i16 [[T109]] to i64
-; CHECK-NEXT:    [[T111:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_5]]
-; CHECK-NEXT:    [[T112:%.*]] = load i16, ptr [[T111]], align 2
-; CHECK-NEXT:    [[T113:%.*]] = zext i16 [[T112]] to i64
-; CHECK-NEXT:    [[SUB_6:%.*]] = sub nsw i64 [[T110]], [[T113]]
-; CHECK-NEXT:    [[T115:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_6]]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP7]], i64 5
+; CHECK-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; CHECK-NEXT:    [[T115:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP19]]
 ; CHECK-NEXT:    [[L_6:%.*]] = load i32, ptr [[T115]], align 4
-; CHECK-NEXT:    [[T119:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_6]]
-; CHECK-NEXT:    [[T120:%.*]] = load i16, ptr [[T119]], align 2
-; CHECK-NEXT:    [[T121:%.*]] = zext i16 [[T120]] to i64
-; CHECK-NEXT:    [[T122:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_6]]
-; CHECK-NEXT:    [[T123:%.*]] = load i16, ptr [[T122]], align 2
-; CHECK-NEXT:    [[T124:%.*]] = zext i16 [[T123]] to i64
-; CHECK-NEXT:    [[SUB_7:%.*]] = sub nsw i64 [[T121]], [[T124]]
-; CHECK-NEXT:    [[T126:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_7]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP7]], i64 6
+; CHECK-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
+; CHECK-NEXT:    [[T126:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[L_7:%.*]] = load i32, ptr [[T126]], align 4
-; CHECK-NEXT:    [[T130:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_7]]
-; CHECK-NEXT:    [[T131:%.*]] = load i16, ptr [[T130]], align 2
-; CHECK-NEXT:    [[T132:%.*]] = zext i16 [[T131]] to i64
-; CHECK-NEXT:    [[T133:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_7]]
-; CHECK-NEXT:    [[T134:%.*]] = load i16, ptr [[T133]], align 2
-; CHECK-NEXT:    [[T135:%.*]] = zext i16 [[T134]] to i64
-; CHECK-NEXT:    [[SUB_8:%.*]] = sub nsw i64 [[T132]], [[T135]]
-; CHECK-NEXT:    [[T137:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP7]], i64 7
+; CHECK-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; CHECK-NEXT:    [[T137:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[L_8:%.*]] = load i32, ptr [[T137]], align 4
 ; CHECK-NEXT:    call void @use(i32 [[L_1]], i32 [[L_2]], i32 [[L_3]], i32 [[L_4]], i32 [[L_5]], i32 [[L_6]], i32 [[L_7]], i32 [[L_8]])
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
index 14ce08cb7aebe..3cb81b72d26a1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
@@ -6,36 +6,19 @@ define void @should_vectorize_gep(ptr %base1, ptr %base2, ptr %base_gep) {
 ; CHECK-LABEL: define void @should_vectorize_gep
 ; CHECK-SAME: (ptr [[BASE1:%.*]], ptr [[BASE2:%.*]], ptr [[BASE_GEP:%.*]]) {
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[BASE1]], align 2
-; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i32 [[LOAD1]] to i64
-; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[BASE2]], align 2
-; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i32 [[LOAD2]] to i64
-; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[ZEXT1]], [[ZEXT2]]
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB]]
-; CHECK-NEXT:    [[GETELEMENTPTR1:%.*]] = getelementptr i32, ptr [[BASE1]], i64 1
-; CHECK-NEXT:    [[GETELEMENTPTR2:%.*]] = getelementptr i32, ptr [[BASE2]], i64 1
-; CHECK-NEXT:    [[LOAD3:%.*]] = load i32, ptr [[GETELEMENTPTR1]], align 2
-; CHECK-NEXT:    [[ZEXT3:%.*]] = zext i32 [[LOAD3]] to i64
-; CHECK-NEXT:    [[LOAD4:%.*]] = load i32, ptr [[GETELEMENTPTR2]], align 2
-; CHECK-NEXT:    [[ZEXT4:%.*]] = zext i32 [[LOAD4]] to i64
-; CHECK-NEXT:    [[SUB2:%.*]] = sub i64 [[ZEXT3]], [[ZEXT4]]
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB2]]
-; CHECK-NEXT:    [[GETELEMENTPTR3:%.*]] = getelementptr i32, ptr [[BASE1]], i64 2
-; CHECK-NEXT:    [[GETELEMENTPTR4:%.*]] = getelementptr i32, ptr [[BASE2]], i64 2
-; CHECK-NEXT:    [[LOAD5:%.*]] = load i32, ptr [[GETELEMENTPTR3]], align 2
-; CHECK-NEXT:    [[ZEXT5:%.*]] = zext i32 [[LOAD5]] to i64
-; CHECK-NEXT:    [[LOAD6:%.*]] = load i32, ptr [[GETELEMENTPTR4]], align 2
-; CHECK-NEXT:    [[ZEXT6:%.*]] = zext i32 [[LOAD6]] to i64
-; CHECK-NEXT:    [[SUB3:%.*]] = sub i64 [[ZEXT5]], [[ZEXT6]]
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB3]]
-; CHECK-NEXT:    [[GETELEMENTPTR5:%.*]] = getelementptr i32, ptr [[BASE1]], i64 3
-; CHECK-NEXT:    [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr [[BASE2]], i64 3
-; CHECK-NEXT:    [[LOAD7:%.*]] = load i32, ptr [[GETELEMENTPTR5]], align 2
-; CHECK-NEXT:    [[ZEXT7:%.*]] = zext i32 [[LOAD7]] to i64
-; CHECK-NEXT:    [[LOAD8:%.*]] = load i32, ptr [[GETELEMENTPTR6]], align 2
-; CHECK-NEXT:    [[ZEXT8:%.*]] = zext i32 [[LOAD8]] to i64
-; CHECK-NEXT:    [[SUB4:%.*]] = sub i64 [[ZEXT7]], [[ZEXT8]]
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE1]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i32> [[TMP0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[BASE2]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP8]]
 ; CHECK-NEXT:    call void @use_4(ptr [[GETELEMENTPTR_RES_1]], ptr [[GETELEMENTPTR_RES_2]], ptr [[GETELEMENTPTR_RES_3]], ptr [[GETELEMENTPTR_RES_4]])
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/gep-indices.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/gep-indices.ll
deleted file mode 100644
index 348fde129c855..0000000000000
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/gep-indices.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
-; RUN:   | FileCheck %s
-;
-; Test that gep indices are not first vectorized and then extracted (into address registers).
-
-%StructTy = type { i8, i64, i64, i64, i64 }
-declare void @bar(ptr, ptr)
-
-define void @fun(ptr %Addr) {
-; CHECK-LABEL: @fun(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[FOR_COND:%.*]]
-; CHECK:       for.cond:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[P2472:%.*]] = getelementptr inbounds [[STRUCTTY:%.*]], ptr [[ADDR:%.*]], i64 [[INDVARS_IV]], i32 3
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[P2472]], align 8
-; CHECK-NEXT:    [[P3476:%.*]] = getelementptr inbounds [[STRUCTTY]], ptr [[ADDR]], i64 [[INDVARS_IV]], i32 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[P3476]], align 8
-; CHECK-NEXT:    [[SEXT:%.*]] = shl i64 [[TMP0]], 32
-; CHECK-NEXT:    [[IDXPROM495:%.*]] = ashr exact i64 [[SEXT]], 32
-; CHECK-NEXT:    [[ARRAYIDX496:%.*]] = getelementptr inbounds [3 x float], ptr null, i64 [[IDXPROM495]]
-; CHECK-NEXT:    [[SEXT4:%.*]] = shl i64 [[TMP1]], 32
-; CHECK-NEXT:    [[IDXPROM499:%.*]] = ashr exact i64 [[SEXT4]], 32
-; CHECK-NEXT:    [[ARRAYIDX500:%.*]] = getelementptr inbounds [3 x float], ptr null, i64 [[IDXPROM499]]
-; CHECK-NEXT:    tail call void @bar(ptr noundef poison, ptr noundef [[ARRAYIDX500]])
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    br label [[FOR_COND]]
-;
-entry:
-  br label %for.cond
-
-for.cond:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.cond ], [ 0, %entry ]
-  %P2472 = getelementptr inbounds %StructTy, ptr %Addr, i64 %indvars.iv, i32 3
-  %0 = load i64, ptr %P2472, align 8
-  %P3476 = getelementptr inbounds %StructTy, ptr %Addr, i64 %indvars.iv, i32 4
-  %1 = load i64, ptr %P3476, align 8
-  %sext = shl i64 %0, 32
-  %idxprom495 = ashr exact i64 %sext, 32
-  %arrayidx496 = getelementptr inbounds [3 x float], ptr null, i64 %idxprom495
-  %sext4 = shl i64 %1, 32
-  %idxprom499 = ashr exact i64 %sext4, 32
-  %arrayidx500 = getelementptr inbounds [3 x float], ptr null, i64 %idxprom499
-  tail call void @bar(ptr noundef poison, ptr noundef %arrayidx500)
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  br label %for.cond
-}

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index 69eded4e96740..5b0ecdb779d23 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -134,49 +134,24 @@ define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x)
 }
 
 define void @PR43578_prefer128(ptr %r, ptr %p, ptr %q) #0 {
-; AVX2-LABEL: @PR43578_prefer128(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 3
-; AVX2-NEXT:    [[Q1:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 1
-; AVX2-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 2
-; AVX2-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 3
-; AVX2-NEXT:    [[X0:%.*]] = load i64, ptr [[P]], align 2
-; AVX2-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 2
-; AVX2-NEXT:    [[X2:%.*]] = load i64, ptr [[P2]], align 2
-; AVX2-NEXT:    [[X3:%.*]] = load i64, ptr [[P3]], align 2
-; AVX2-NEXT:    [[Y0:%.*]] = load i64, ptr [[Q]], align 2
-; AVX2-NEXT:    [[Y1:%.*]] = load i64, ptr [[Q1]], align 2
-; AVX2-NEXT:    [[Y2:%.*]] = load i64, ptr [[Q2]], align 2
-; AVX2-NEXT:    [[Y3:%.*]] = load i64, ptr [[Q3]], align 2
-; AVX2-NEXT:    [[SUB0:%.*]] = sub nsw i64 [[X0]], [[Y0]]
-; AVX2-NEXT:    [[SUB1:%.*]] = sub nsw i64 [[X1]], [[Y1]]
-; AVX2-NEXT:    [[SUB2:%.*]] = sub nsw i64 [[X2]], [[Y2]]
-; AVX2-NEXT:    [[SUB3:%.*]] = sub nsw i64 [[X3]], [[Y3]]
-; AVX2-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[SUB0]]
-; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB1]]
-; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB2]]
-; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB3]]
-; AVX2-NEXT:    ret void
-;
-; AVX512-LABEL: @PR43578_prefer128(
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 2
-; AVX512-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 2
-; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 2
-; AVX512-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[Q]], align 2
-; AVX512-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i64> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[P2]], align 2
-; AVX512-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[Q2]], align 2
-; AVX512-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]]
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[TMP7]]
-; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP8]]
-; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; AVX512-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP9]]
-; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; AVX512-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP10]]
-; AVX512-NEXT:    ret void
+; CHECK-LABEL: @PR43578_prefer128(
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 2
+; CHECK-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[Q]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[P2]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[Q2]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP10]]
+; CHECK-NEXT:    ret void
 ;
   %p1 = getelementptr inbounds i64, ptr %p, i64 1
   %p2 = getelementptr inbounds i64, ptr %p, i64 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index c7f2e174986b7..49b88f85968a7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -134,49 +134,24 @@ define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x)
 }
 
 define void @PR43578_prefer128(ptr %r, ptr %p, ptr %q) #0 {
-; AVX2-LABEL: @PR43578_prefer128(
-; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 1
-; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 2
-; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 3
-; AVX2-NEXT:    [[Q1:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 1
-; AVX2-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 2
-; AVX2-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 3
-; AVX2-NEXT:    [[X0:%.*]] = load i64, ptr [[P]], align 2
-; AVX2-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 2
-; AVX2-NEXT:    [[X2:%.*]] = load i64, ptr [[P2]], align 2
-; AVX2-NEXT:    [[X3:%.*]] = load i64, ptr [[P3]], align 2
-; AVX2-NEXT:    [[Y0:%.*]] = load i64, ptr [[Q]], align 2
-; AVX2-NEXT:    [[Y1:%.*]] = load i64, ptr [[Q1]], align 2
-; AVX2-NEXT:    [[Y2:%.*]] = load i64, ptr [[Q2]], align 2
-; AVX2-NEXT:    [[Y3:%.*]] = load i64, ptr [[Q3]], align 2
-; AVX2-NEXT:    [[SUB0:%.*]] = sub nsw i64 [[X0]], [[Y0]]
-; AVX2-NEXT:    [[SUB1:%.*]] = sub nsw i64 [[X1]], [[Y1]]
-; AVX2-NEXT:    [[SUB2:%.*]] = sub nsw i64 [[X2]], [[Y2]]
-; AVX2-NEXT:    [[SUB3:%.*]] = sub nsw i64 [[X3]], [[Y3]]
-; AVX2-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[SUB0]]
-; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB1]]
-; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB2]]
-; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB3]]
-; AVX2-NEXT:    ret void
-;
-; AVX512-LABEL: @PR43578_prefer128(
-; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 2
-; AVX512-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 2
-; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 2
-; AVX512-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[Q]], align 2
-; AVX512-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i64> [[TMP1]], [[TMP2]]
-; AVX512-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[P2]], align 2
-; AVX512-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[Q2]], align 2
-; AVX512-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]]
-; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX512-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[TMP7]]
-; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX512-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP8]]
-; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; AVX512-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP9]]
-; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; AVX512-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP10]]
-; AVX512-NEXT:    ret void
+; CHECK-LABEL: @PR43578_prefer128(
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 2
+; CHECK-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[Q]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[P2]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[Q2]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP10]]
+; CHECK-NEXT:    ret void
 ;
   %p1 = getelementptr inbounds i64, ptr %p, i64 1
   %p2 = getelementptr inbounds i64, ptr %p, i64 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
index 4ef0e3e5fbfdf..68ed6062e3c40 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+sse2     -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx      -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx2     -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512f  -S | FileCheck %s --check-prefixes=AVX512F
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512vl -S | FileCheck %s --check-prefixes=AVX512VL
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+sse2     -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx      -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx2     -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512f  -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512vl -S | FileCheck %s --check-prefixes=CHECK,AVX
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -15,76 +15,21 @@ target triple = "x86_64-unknown-linux-gnu"
 ; zero-extend the roots back to their original sizes.
 ;
 define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
-; SSE-LABEL: @PR31243_zext(
-; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
-; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
-; SSE-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
-; SSE-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
-; SSE-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
-; SSE-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
-; SSE-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
-; SSE-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
-; SSE-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
-; SSE-NEXT:    ret i8 [[TMP_8]]
-;
-; AVX-LABEL: @PR31243_zext(
-; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
-; AVX-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
-; AVX-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
-; AVX-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
-; AVX-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
-; AVX-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
-; AVX-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
-; AVX-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
-; AVX-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
-; AVX-NEXT:    ret i8 [[TMP_8]]
-;
-; AVX2-LABEL: @PR31243_zext(
-; AVX2-NEXT:  entry:
-; AVX2-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
-; AVX2-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
-; AVX2-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
-; AVX2-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
-; AVX2-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
-; AVX2-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
-; AVX2-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
-; AVX2-NEXT:    ret i8 [[TMP_8]]
-;
-; AVX512F-LABEL: @PR31243_zext(
-; AVX512F-NEXT:  entry:
-; AVX512F-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; AVX512F-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
-; AVX512F-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; AVX512F-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; AVX512F-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; AVX512F-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; AVX512F-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; AVX512F-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
-; AVX512F-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
-; AVX512F-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
-; AVX512F-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
-; AVX512F-NEXT:    ret i8 [[TMP_8]]
-;
-; AVX512VL-LABEL: @PR31243_zext(
-; AVX512VL-NEXT:  entry:
-; AVX512VL-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
-; AVX512VL-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; AVX512VL-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; AVX512VL-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; AVX512VL-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; AVX512VL-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; AVX512VL-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; AVX512VL-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
-; AVX512VL-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
-; AVX512VL-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
-; AVX512VL-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
-; AVX512VL-NEXT:    ret i8 [[TMP_8]]
+; CHECK-LABEL: @PR31243_zext(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
+; CHECK-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
+; CHECK-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
+; CHECK-NEXT:    ret i8 [[TMP_8]]
 ;
 entry:
   %tmp_0 = zext i8 %v0 to i32
@@ -128,64 +73,21 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ;
 ; AVX-LABEL: @PR31243_sext(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
-; AVX-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
-; AVX-NEXT:    [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
-; AVX-NEXT:    [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
+; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
+; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]]
+; AVX-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1
+; AVX-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]]
 ; AVX-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
 ; AVX-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
 ; AVX-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
 ; AVX-NEXT:    ret i8 [[TMP8]]
 ;
-; AVX2-LABEL: @PR31243_sext(
-; AVX2-NEXT:  entry:
-; AVX2-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
-; AVX2-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
-; AVX2-NEXT:    [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
-; AVX2-NEXT:    [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
-; AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
-; AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
-; AVX2-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
-; AVX2-NEXT:    ret i8 [[TMP8]]
-;
-; AVX512F-LABEL: @PR31243_sext(
-; AVX512F-NEXT:  entry:
-; AVX512F-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; AVX512F-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
-; AVX512F-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; AVX512F-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
-; AVX512F-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0
-; AVX512F-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
-; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]]
-; AVX512F-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1
-; AVX512F-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]]
-; AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
-; AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
-; AVX512F-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
-; AVX512F-NEXT:    ret i8 [[TMP8]]
-;
-; AVX512VL-LABEL: @PR31243_sext(
-; AVX512VL-NEXT:  entry:
-; AVX512VL-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
-; AVX512VL-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; AVX512VL-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
-; AVX512VL-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0
-; AVX512VL-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
-; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]]
-; AVX512VL-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1
-; AVX512VL-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]]
-; AVX512VL-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
-; AVX512VL-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
-; AVX512VL-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
-; AVX512VL-NEXT:    ret i8 [[TMP8]]
-;
 entry:
   %tmp0 = sext i8 %v0 to i32
   %tmp1 = sext i8 %v1 to i32

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
index 97ca690e15e77..85791827e0894 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
@@ -4,29 +4,18 @@
 define void @test(ptr %r, ptr %p, ptr %q) #0 {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[P0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 0
-; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 1
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 2
-; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 3
 ; CHECK-NEXT:    [[Q0:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 0
-; CHECK-NEXT:    [[Q1:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 1
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 2
-; CHECK-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 3
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[P0]], align 2
-; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 2
-; CHECK-NEXT:    [[X2:%.*]] = load i64, ptr [[P2]], align 2
-; CHECK-NEXT:    [[X3:%.*]] = load i64, ptr [[P3]], align 2
-; CHECK-NEXT:    [[Y0:%.*]] = load i64, ptr [[Q0]], align 2
-; CHECK-NEXT:    [[Y1:%.*]] = load i64, ptr [[Q1]], align 2
-; CHECK-NEXT:    [[Y2:%.*]] = load i64, ptr [[Q2]], align 2
-; CHECK-NEXT:    [[Y3:%.*]] = load i64, ptr [[Q3]], align 2
-; CHECK-NEXT:    [[SUB0:%.*]] = sub nsw i64 [[X0]], [[Y0]]
-; CHECK-NEXT:    [[SUB1:%.*]] = sub nsw i64 [[X1]], [[Y1]]
-; CHECK-NEXT:    [[SUB2:%.*]] = sub nsw i64 [[X2]], [[Y2]]
-; CHECK-NEXT:    [[SUB3:%.*]] = sub nsw i64 [[X3]], [[Y3]]
-; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[SUB0]]
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB1]]
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB2]]
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[Q0]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP7]]
 ; CHECK-NEXT:    ret void
 ;
   %p0 = getelementptr inbounds i64, ptr %p, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
index 0b9ed47ce0f17..b9747b6ae8c89 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
@@ -13,30 +13,30 @@ define void @get_block(i32 %y_pos) local_unnamed_addr #0 {
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[SUB14:%.*]] = sub nsw i32 [[Y_POS:%.*]], undef
 ; CHECK-NEXT:    [[SHR15:%.*]] = ashr i32 [[SUB14]], 2
-; CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp sgt i32 [[SHR15]], 0
-; CHECK-NEXT:    [[COND_I_I:%.*]] = select i1 [[CMP_I_I]], i32 [[SHR15]], i32 0
-; CHECK-NEXT:    [[CMP_I4_I:%.*]] = icmp slt i32 [[COND_I_I]], undef
-; CHECK-NEXT:    [[COND_I5_I:%.*]] = select i1 [[CMP_I4_I]], i32 [[COND_I_I]], i32 undef
-; CHECK-NEXT:    [[IDXPROM30:%.*]] = sext i32 [[COND_I5_I]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30]]
-; CHECK-NEXT:    [[CMP_I_I_1:%.*]] = icmp sgt i32 [[SUB14]], -1
-; CHECK-NEXT:    [[COND_I_I_1:%.*]] = select i1 [[CMP_I_I_1]], i32 undef, i32 0
-; CHECK-NEXT:    [[CMP_I4_I_1:%.*]] = icmp slt i32 [[COND_I_I_1]], undef
-; CHECK-NEXT:    [[COND_I5_I_1:%.*]] = select i1 [[CMP_I4_I_1]], i32 [[COND_I_I_1]], i32 undef
-; CHECK-NEXT:    [[IDXPROM30_1:%.*]] = sext i32 [[COND_I5_I_1]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31_1:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30_1]]
-; CHECK-NEXT:    [[CMP_I_I_2:%.*]] = icmp sgt i32 [[SUB14]], -5
-; CHECK-NEXT:    [[COND_I_I_2:%.*]] = select i1 [[CMP_I_I_2]], i32 undef, i32 0
-; CHECK-NEXT:    [[CMP_I4_I_2:%.*]] = icmp slt i32 [[COND_I_I_2]], undef
-; CHECK-NEXT:    [[COND_I5_I_2:%.*]] = select i1 [[CMP_I4_I_2]], i32 [[COND_I_I_2]], i32 undef
-; CHECK-NEXT:    [[IDXPROM30_2:%.*]] = sext i32 [[COND_I5_I_2]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31_2:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30_2]]
-; CHECK-NEXT:    [[CMP_I_I_3:%.*]] = icmp sgt i32 [[SUB14]], -9
-; CHECK-NEXT:    [[COND_I_I_3:%.*]] = select i1 [[CMP_I_I_3]], i32 undef, i32 0
-; CHECK-NEXT:    [[CMP_I4_I_3:%.*]] = icmp slt i32 [[COND_I_I_3]], undef
-; CHECK-NEXT:    [[COND_I5_I_3:%.*]] = select i1 [[CMP_I4_I_3]], i32 [[COND_I_I_3]], i32 undef
-; CHECK-NEXT:    [[IDXPROM30_3:%.*]] = sext i32 [[COND_I5_I_3]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31_3:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30_3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[SHR15]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[SUB14]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[TMP2]], <i32 0, i32 -1, i32 -5, i32 -9>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 undef, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 undef, i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 undef, i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], undef
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> undef
+; CHECK-NEXT:    [[TMP10:%.*]] = sext <4 x i32> [[TMP9]] to <4 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31_1:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
+; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31_2:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31_3:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP19]]
 ; CHECK-NEXT:    unreachable
 ;
 entry:


        


More information about the llvm-commits mailing list