[llvm] 1387a13 - [SLP] Check with target before vectorizing GEP Indices.

Jonas Paulsson via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 23 06:32:30 PST 2023


Author: Jonas Paulsson
Date: 2023-02-23T15:31:34+01:00
New Revision: 1387a13e1d0bac94457626ef3e7427c84caf6e65

URL: https://github.com/llvm/llvm-project/commit/1387a13e1d0bac94457626ef3e7427c84caf6e65
DIFF: https://github.com/llvm/llvm-project/commit/1387a13e1d0bac94457626ef3e7427c84caf6e65.diff

LOG: [SLP] Check with target before vectorizing GEP Indices.

The target hook prefersVectorizedAddressing() already exists to check with the
target whether address computations should be vectorized, so it seems like it
should be used in SLPVectorizer as well.

Reviewed By: ABataev, RKSimon

Differential Revision: https://reviews.llvm.org/D144128

Added: 
    llvm/test/Transforms/SLPVectorizer/SystemZ/gep-indices.ll

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
    llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
    llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
    llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
    llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
    llvm/test/Transforms/SLPVectorizer/X86/partail.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8d228238c72c2..7adcf660d483a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5191,6 +5191,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                           Depth](ArrayRef<Value *> VL) {
     if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
       return false;
+    if (S.getOpcode() == Instruction::GetElementPtr &&
+        !TTI->prefersVectorizedAddressing())
+      return true;
     if (VectorizableTree.size() < MinTreeSize)
       return false;
     if (Depth >= RecursionMaxDepth - 1)
@@ -11873,21 +11876,23 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
       if (!isValidElementType(SI->getValueOperand()->getType()))
         continue;
       Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
+      continue;
     }
 
     // Ignore getelementptr instructions that have more than one index, a
     // constant index, or a pointer operand that doesn't point to a scalar
     // type.
-    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
-      auto Idx = GEP->idx_begin()->get();
-      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
-        continue;
-      if (!isValidElementType(Idx->getType()))
-        continue;
-      if (GEP->getType()->isVectorTy())
-        continue;
-      GEPs[GEP->getPointerOperand()].push_back(GEP);
-    }
+    if (TTI->prefersVectorizedAddressing())
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+        auto Idx = GEP->idx_begin()->get();
+        if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+          continue;
+        if (!isValidElementType(Idx->getType()))
+          continue;
+        if (GEP->getType()->isVectorTy())
+          continue;
+        GEPs[GEP->getPointerOperand()].push_back(GEP);
+      }
   }
 }
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index e3ad918498ee7..2faec3047e44a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -12,18 +12,21 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
+; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
+; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
+; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[S1]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
+; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[S2]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
+; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[S3]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void
@@ -58,23 +61,25 @@ define void @test2(<4 x i16> %a, <4 x i16> %b, i64 %c0, i64 %c1, i64 %c2, i64 %c
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[C0:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
+; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
+; CHECK-NEXT:    [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]]
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[A0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP7]]
+; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
+; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
+; CHECK-NEXT:    [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]]
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A1]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP8]]
+; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
+; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
+; CHECK-NEXT:    [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A2]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
+; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
+; CHECK-NEXT:    [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A3]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
index de9859df3aad9..cf4aba2a61c18 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -36,59 +36,92 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
-; GENERIC-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
-; GENERIC-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
-; GENERIC-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
-; GENERIC-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
-; GENERIC-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
-; GENERIC-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
-; GENERIC-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP8]]
-; GENERIC-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 1
+; GENERIC-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_0101]], align 2
+; GENERIC-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 1
+; GENERIC-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2
+; GENERIC-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP1]] to i64
+; GENERIC-NEXT:    [[SUB:%.*]] = sub nsw i64 [[CONV]], [[CONV2]]
+; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[SUB]]
+; GENERIC-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP2]] to i32
 ; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; GENERIC-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
-; GENERIC-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP11]]
-; GENERIC-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
-; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 2
+; GENERIC-NEXT:    [[TMP3:%.*]] = load i16, ptr [[INCDEC_PTR]], align 2
+; GENERIC-NEXT:    [[CONV5:%.*]] = zext i16 [[TMP3]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 2
+; GENERIC-NEXT:    [[TMP4:%.*]] = load i16, ptr [[INCDEC_PTR1]], align 2
+; GENERIC-NEXT:    [[CONV7:%.*]] = zext i16 [[TMP4]] to i64
+; GENERIC-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[CONV5]], [[CONV7]]
+; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB8]]
+; GENERIC-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP5]] to i32
 ; GENERIC-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; GENERIC-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
-; GENERIC-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP14]]
-; GENERIC-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
-; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR13:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 3
+; GENERIC-NEXT:    [[TMP6:%.*]] = load i16, ptr [[INCDEC_PTR4]], align 2
+; GENERIC-NEXT:    [[CONV14:%.*]] = zext i16 [[TMP6]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR15:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 3
+; GENERIC-NEXT:    [[TMP7:%.*]] = load i16, ptr [[INCDEC_PTR6]], align 2
+; GENERIC-NEXT:    [[CONV16:%.*]] = zext i16 [[TMP7]] to i64
+; GENERIC-NEXT:    [[SUB17:%.*]] = sub nsw i64 [[CONV14]], [[CONV16]]
+; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB17]]
+; GENERIC-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
+; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP8]] to i32
 ; GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; GENERIC-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
-; GENERIC-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP17]]
-; GENERIC-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
-; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR22:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 4
+; GENERIC-NEXT:    [[TMP9:%.*]] = load i16, ptr [[INCDEC_PTR13]], align 2
+; GENERIC-NEXT:    [[CONV23:%.*]] = zext i16 [[TMP9]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR24:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 4
+; GENERIC-NEXT:    [[TMP10:%.*]] = load i16, ptr [[INCDEC_PTR15]], align 2
+; GENERIC-NEXT:    [[CONV25:%.*]] = zext i16 [[TMP10]] to i64
+; GENERIC-NEXT:    [[SUB26:%.*]] = sub nsw i64 [[CONV23]], [[CONV25]]
+; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB26]]
+; GENERIC-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
+; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP11]] to i32
 ; GENERIC-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; GENERIC-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
-; GENERIC-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP20]]
-; GENERIC-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
-; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR31:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 5
+; GENERIC-NEXT:    [[TMP12:%.*]] = load i16, ptr [[INCDEC_PTR22]], align 2
+; GENERIC-NEXT:    [[CONV32:%.*]] = zext i16 [[TMP12]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR33:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 5
+; GENERIC-NEXT:    [[TMP13:%.*]] = load i16, ptr [[INCDEC_PTR24]], align 2
+; GENERIC-NEXT:    [[CONV34:%.*]] = zext i16 [[TMP13]] to i64
+; GENERIC-NEXT:    [[SUB35:%.*]] = sub nsw i64 [[CONV32]], [[CONV34]]
+; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB35]]
+; GENERIC-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
+; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP14]] to i32
 ; GENERIC-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; GENERIC-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
-; GENERIC-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP23]]
-; GENERIC-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
-; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR40:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 6
+; GENERIC-NEXT:    [[TMP15:%.*]] = load i16, ptr [[INCDEC_PTR31]], align 2
+; GENERIC-NEXT:    [[CONV41:%.*]] = zext i16 [[TMP15]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR42:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 6
+; GENERIC-NEXT:    [[TMP16:%.*]] = load i16, ptr [[INCDEC_PTR33]], align 2
+; GENERIC-NEXT:    [[CONV43:%.*]] = zext i16 [[TMP16]] to i64
+; GENERIC-NEXT:    [[SUB44:%.*]] = sub nsw i64 [[CONV41]], [[CONV43]]
+; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB44]]
+; GENERIC-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
+; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP17]] to i32
 ; GENERIC-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; GENERIC-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
-; GENERIC-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP26]]
-; GENERIC-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
-; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR49:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 7
+; GENERIC-NEXT:    [[TMP18:%.*]] = load i16, ptr [[INCDEC_PTR40]], align 2
+; GENERIC-NEXT:    [[CONV50:%.*]] = zext i16 [[TMP18]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR51:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 7
+; GENERIC-NEXT:    [[TMP19:%.*]] = load i16, ptr [[INCDEC_PTR42]], align 2
+; GENERIC-NEXT:    [[CONV52:%.*]] = zext i16 [[TMP19]] to i64
+; GENERIC-NEXT:    [[SUB53:%.*]] = sub nsw i64 [[CONV50]], [[CONV52]]
+; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB53]]
+; GENERIC-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
+; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP20]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; GENERIC-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
-; GENERIC-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP29]]
-; GENERIC-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
-; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT:    [[TMP21:%.*]] = load i16, ptr [[INCDEC_PTR49]], align 2
+; GENERIC-NEXT:    [[CONV59:%.*]] = zext i16 [[TMP21]] to i64
+; GENERIC-NEXT:    [[TMP22:%.*]] = load i16, ptr [[INCDEC_PTR51]], align 2
+; GENERIC-NEXT:    [[CONV61:%.*]] = zext i16 [[TMP22]] to i64
+; GENERIC-NEXT:    [[SUB62:%.*]] = sub nsw i64 [[CONV59]], [[CONV61]]
+; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB62]]
+; GENERIC-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
+; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP23]] to i32
 ; GENERIC-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
 ; GENERIC-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
 ; GENERIC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -109,59 +142,92 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
-; KRYO-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
-; KRYO-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
-; KRYO-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
-; KRYO-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
-; KRYO-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
-; KRYO-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
-; KRYO-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
-; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP8]]
-; KRYO-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 1
+; KRYO-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_0101]], align 2
+; KRYO-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 1
+; KRYO-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2
+; KRYO-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP1]] to i64
+; KRYO-NEXT:    [[SUB:%.*]] = sub nsw i64 [[CONV]], [[CONV2]]
+; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[SUB]]
+; KRYO-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP2]] to i32
 ; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; KRYO-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
-; KRYO-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
-; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP11]]
-; KRYO-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
-; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 2
+; KRYO-NEXT:    [[TMP3:%.*]] = load i16, ptr [[INCDEC_PTR]], align 2
+; KRYO-NEXT:    [[CONV5:%.*]] = zext i16 [[TMP3]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 2
+; KRYO-NEXT:    [[TMP4:%.*]] = load i16, ptr [[INCDEC_PTR1]], align 2
+; KRYO-NEXT:    [[CONV7:%.*]] = zext i16 [[TMP4]] to i64
+; KRYO-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[CONV5]], [[CONV7]]
+; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB8]]
+; KRYO-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP5]] to i32
 ; KRYO-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; KRYO-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
-; KRYO-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP14]]
-; KRYO-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
-; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR13:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 3
+; KRYO-NEXT:    [[TMP6:%.*]] = load i16, ptr [[INCDEC_PTR4]], align 2
+; KRYO-NEXT:    [[CONV14:%.*]] = zext i16 [[TMP6]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR15:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 3
+; KRYO-NEXT:    [[TMP7:%.*]] = load i16, ptr [[INCDEC_PTR6]], align 2
+; KRYO-NEXT:    [[CONV16:%.*]] = zext i16 [[TMP7]] to i64
+; KRYO-NEXT:    [[SUB17:%.*]] = sub nsw i64 [[CONV14]], [[CONV16]]
+; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB17]]
+; KRYO-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
+; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP8]] to i32
 ; KRYO-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; KRYO-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
-; KRYO-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
-; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP17]]
-; KRYO-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
-; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR22:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 4
+; KRYO-NEXT:    [[TMP9:%.*]] = load i16, ptr [[INCDEC_PTR13]], align 2
+; KRYO-NEXT:    [[CONV23:%.*]] = zext i16 [[TMP9]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR24:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 4
+; KRYO-NEXT:    [[TMP10:%.*]] = load i16, ptr [[INCDEC_PTR15]], align 2
+; KRYO-NEXT:    [[CONV25:%.*]] = zext i16 [[TMP10]] to i64
+; KRYO-NEXT:    [[SUB26:%.*]] = sub nsw i64 [[CONV23]], [[CONV25]]
+; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB26]]
+; KRYO-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
+; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP11]] to i32
 ; KRYO-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; KRYO-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
-; KRYO-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
-; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP20]]
-; KRYO-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
-; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR31:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 5
+; KRYO-NEXT:    [[TMP12:%.*]] = load i16, ptr [[INCDEC_PTR22]], align 2
+; KRYO-NEXT:    [[CONV32:%.*]] = zext i16 [[TMP12]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR33:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 5
+; KRYO-NEXT:    [[TMP13:%.*]] = load i16, ptr [[INCDEC_PTR24]], align 2
+; KRYO-NEXT:    [[CONV34:%.*]] = zext i16 [[TMP13]] to i64
+; KRYO-NEXT:    [[SUB35:%.*]] = sub nsw i64 [[CONV32]], [[CONV34]]
+; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB35]]
+; KRYO-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
+; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP14]] to i32
 ; KRYO-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; KRYO-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
-; KRYO-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
-; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP23]]
-; KRYO-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
-; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR40:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 6
+; KRYO-NEXT:    [[TMP15:%.*]] = load i16, ptr [[INCDEC_PTR31]], align 2
+; KRYO-NEXT:    [[CONV41:%.*]] = zext i16 [[TMP15]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR42:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 6
+; KRYO-NEXT:    [[TMP16:%.*]] = load i16, ptr [[INCDEC_PTR33]], align 2
+; KRYO-NEXT:    [[CONV43:%.*]] = zext i16 [[TMP16]] to i64
+; KRYO-NEXT:    [[SUB44:%.*]] = sub nsw i64 [[CONV41]], [[CONV43]]
+; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB44]]
+; KRYO-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
+; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP17]] to i32
 ; KRYO-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; KRYO-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
-; KRYO-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
-; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP26]]
-; KRYO-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
-; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR49:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 7
+; KRYO-NEXT:    [[TMP18:%.*]] = load i16, ptr [[INCDEC_PTR40]], align 2
+; KRYO-NEXT:    [[CONV50:%.*]] = zext i16 [[TMP18]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR51:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 7
+; KRYO-NEXT:    [[TMP19:%.*]] = load i16, ptr [[INCDEC_PTR42]], align 2
+; KRYO-NEXT:    [[CONV52:%.*]] = zext i16 [[TMP19]] to i64
+; KRYO-NEXT:    [[SUB53:%.*]] = sub nsw i64 [[CONV50]], [[CONV52]]
+; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB53]]
+; KRYO-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
+; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP20]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; KRYO-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
-; KRYO-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP29]]
-; KRYO-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
-; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; KRYO-NEXT:    [[TMP21:%.*]] = load i16, ptr [[INCDEC_PTR49]], align 2
+; KRYO-NEXT:    [[CONV59:%.*]] = zext i16 [[TMP21]] to i64
+; KRYO-NEXT:    [[TMP22:%.*]] = load i16, ptr [[INCDEC_PTR51]], align 2
+; KRYO-NEXT:    [[CONV61:%.*]] = zext i16 [[TMP22]] to i64
+; KRYO-NEXT:    [[SUB62:%.*]] = sub nsw i64 [[CONV59]], [[CONV61]]
+; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB62]]
+; KRYO-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
+; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP23]] to i32
 ; KRYO-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
 ; KRYO-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
 ; KRYO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -293,59 +359,92 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
-; GENERIC-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
-; GENERIC-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
-; GENERIC-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
-; GENERIC-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
-; GENERIC-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
-; GENERIC-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
-; GENERIC-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP8]]
-; GENERIC-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 1
+; GENERIC-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_0101]], align 2
+; GENERIC-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 1
+; GENERIC-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2
+; GENERIC-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP1]] to i64
+; GENERIC-NEXT:    [[SUB:%.*]] = sub nsw i64 [[CONV]], [[CONV2]]
+; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[SUB]]
+; GENERIC-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP2]] to i32
 ; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; GENERIC-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
-; GENERIC-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP11]]
-; GENERIC-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
-; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 2
+; GENERIC-NEXT:    [[TMP3:%.*]] = load i16, ptr [[INCDEC_PTR]], align 2
+; GENERIC-NEXT:    [[CONV5:%.*]] = zext i16 [[TMP3]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 2
+; GENERIC-NEXT:    [[TMP4:%.*]] = load i16, ptr [[INCDEC_PTR1]], align 2
+; GENERIC-NEXT:    [[CONV7:%.*]] = zext i16 [[TMP4]] to i64
+; GENERIC-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[CONV5]], [[CONV7]]
+; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB8]]
+; GENERIC-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP5]] to i32
 ; GENERIC-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; GENERIC-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
-; GENERIC-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP14]]
-; GENERIC-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
-; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR13:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 3
+; GENERIC-NEXT:    [[TMP6:%.*]] = load i16, ptr [[INCDEC_PTR4]], align 2
+; GENERIC-NEXT:    [[CONV14:%.*]] = zext i16 [[TMP6]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR15:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 3
+; GENERIC-NEXT:    [[TMP7:%.*]] = load i16, ptr [[INCDEC_PTR6]], align 2
+; GENERIC-NEXT:    [[CONV16:%.*]] = zext i16 [[TMP7]] to i64
+; GENERIC-NEXT:    [[SUB17:%.*]] = sub nsw i64 [[CONV14]], [[CONV16]]
+; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB17]]
+; GENERIC-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
+; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP8]] to i32
 ; GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; GENERIC-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
-; GENERIC-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP17]]
-; GENERIC-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
-; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR22:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 4
+; GENERIC-NEXT:    [[TMP9:%.*]] = load i16, ptr [[INCDEC_PTR13]], align 2
+; GENERIC-NEXT:    [[CONV23:%.*]] = zext i16 [[TMP9]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR24:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 4
+; GENERIC-NEXT:    [[TMP10:%.*]] = load i16, ptr [[INCDEC_PTR15]], align 2
+; GENERIC-NEXT:    [[CONV25:%.*]] = zext i16 [[TMP10]] to i64
+; GENERIC-NEXT:    [[SUB26:%.*]] = sub nsw i64 [[CONV23]], [[CONV25]]
+; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB26]]
+; GENERIC-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
+; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP11]] to i32
 ; GENERIC-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; GENERIC-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
-; GENERIC-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP20]]
-; GENERIC-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
-; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR31:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 5
+; GENERIC-NEXT:    [[TMP12:%.*]] = load i16, ptr [[INCDEC_PTR22]], align 2
+; GENERIC-NEXT:    [[CONV32:%.*]] = zext i16 [[TMP12]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR33:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 5
+; GENERIC-NEXT:    [[TMP13:%.*]] = load i16, ptr [[INCDEC_PTR24]], align 2
+; GENERIC-NEXT:    [[CONV34:%.*]] = zext i16 [[TMP13]] to i64
+; GENERIC-NEXT:    [[SUB35:%.*]] = sub nsw i64 [[CONV32]], [[CONV34]]
+; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB35]]
+; GENERIC-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
+; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP14]] to i32
 ; GENERIC-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; GENERIC-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
-; GENERIC-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP23]]
-; GENERIC-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
-; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR40:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 6
+; GENERIC-NEXT:    [[TMP15:%.*]] = load i16, ptr [[INCDEC_PTR31]], align 2
+; GENERIC-NEXT:    [[CONV41:%.*]] = zext i16 [[TMP15]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR42:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 6
+; GENERIC-NEXT:    [[TMP16:%.*]] = load i16, ptr [[INCDEC_PTR33]], align 2
+; GENERIC-NEXT:    [[CONV43:%.*]] = zext i16 [[TMP16]] to i64
+; GENERIC-NEXT:    [[SUB44:%.*]] = sub nsw i64 [[CONV41]], [[CONV43]]
+; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB44]]
+; GENERIC-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
+; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP17]] to i32
 ; GENERIC-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; GENERIC-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
-; GENERIC-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP26]]
-; GENERIC-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
-; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR49:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 7
+; GENERIC-NEXT:    [[TMP18:%.*]] = load i16, ptr [[INCDEC_PTR40]], align 2
+; GENERIC-NEXT:    [[CONV50:%.*]] = zext i16 [[TMP18]] to i64
+; GENERIC-NEXT:    [[INCDEC_PTR51:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 7
+; GENERIC-NEXT:    [[TMP19:%.*]] = load i16, ptr [[INCDEC_PTR42]], align 2
+; GENERIC-NEXT:    [[CONV52:%.*]] = zext i16 [[TMP19]] to i64
+; GENERIC-NEXT:    [[SUB53:%.*]] = sub nsw i64 [[CONV50]], [[CONV52]]
+; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB53]]
+; GENERIC-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
+; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP20]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; GENERIC-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
-; GENERIC-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP29]]
-; GENERIC-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
-; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT:    [[TMP21:%.*]] = load i16, ptr [[INCDEC_PTR49]], align 2
+; GENERIC-NEXT:    [[CONV59:%.*]] = zext i16 [[TMP21]] to i64
+; GENERIC-NEXT:    [[TMP22:%.*]] = load i16, ptr [[INCDEC_PTR51]], align 2
+; GENERIC-NEXT:    [[CONV61:%.*]] = zext i16 [[TMP22]] to i64
+; GENERIC-NEXT:    [[SUB62:%.*]] = sub nsw i64 [[CONV59]], [[CONV61]]
+; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB62]]
+; GENERIC-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
+; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP23]] to i32
 ; GENERIC-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
 ; GENERIC-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
 ; GENERIC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
@@ -366,59 +465,92 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
-; KRYO-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
-; KRYO-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
-; KRYO-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
-; KRYO-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
-; KRYO-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
-; KRYO-NEXT:    [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
-; KRYO-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
-; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP8]]
-; KRYO-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 1
+; KRYO-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR_0101]], align 2
+; KRYO-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i64 1
+; KRYO-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B]], align 2
+; KRYO-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP1]] to i64
+; KRYO-NEXT:    [[SUB:%.*]] = sub nsw i64 [[CONV]], [[CONV2]]
+; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[SUB]]
+; KRYO-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP2]] to i32
 ; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; KRYO-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
-; KRYO-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
-; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP11]]
-; KRYO-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
-; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 2
+; KRYO-NEXT:    [[TMP3:%.*]] = load i16, ptr [[INCDEC_PTR]], align 2
+; KRYO-NEXT:    [[CONV5:%.*]] = zext i16 [[TMP3]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 2
+; KRYO-NEXT:    [[TMP4:%.*]] = load i16, ptr [[INCDEC_PTR1]], align 2
+; KRYO-NEXT:    [[CONV7:%.*]] = zext i16 [[TMP4]] to i64
+; KRYO-NEXT:    [[SUB8:%.*]] = sub nsw i64 [[CONV5]], [[CONV7]]
+; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB8]]
+; KRYO-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP5]] to i32
 ; KRYO-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; KRYO-NEXT:    [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
-; KRYO-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP14]]
-; KRYO-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
-; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR13:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 3
+; KRYO-NEXT:    [[TMP6:%.*]] = load i16, ptr [[INCDEC_PTR4]], align 2
+; KRYO-NEXT:    [[CONV14:%.*]] = zext i16 [[TMP6]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR15:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 3
+; KRYO-NEXT:    [[TMP7:%.*]] = load i16, ptr [[INCDEC_PTR6]], align 2
+; KRYO-NEXT:    [[CONV16:%.*]] = zext i16 [[TMP7]] to i64
+; KRYO-NEXT:    [[SUB17:%.*]] = sub nsw i64 [[CONV14]], [[CONV16]]
+; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB17]]
+; KRYO-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
+; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP8]] to i32
 ; KRYO-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; KRYO-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
-; KRYO-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
-; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP17]]
-; KRYO-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
-; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR22:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 4
+; KRYO-NEXT:    [[TMP9:%.*]] = load i16, ptr [[INCDEC_PTR13]], align 2
+; KRYO-NEXT:    [[CONV23:%.*]] = zext i16 [[TMP9]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR24:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 4
+; KRYO-NEXT:    [[TMP10:%.*]] = load i16, ptr [[INCDEC_PTR15]], align 2
+; KRYO-NEXT:    [[CONV25:%.*]] = zext i16 [[TMP10]] to i64
+; KRYO-NEXT:    [[SUB26:%.*]] = sub nsw i64 [[CONV23]], [[CONV25]]
+; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB26]]
+; KRYO-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
+; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP11]] to i32
 ; KRYO-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; KRYO-NEXT:    [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
-; KRYO-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
-; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP20]]
-; KRYO-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
-; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR31:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 5
+; KRYO-NEXT:    [[TMP12:%.*]] = load i16, ptr [[INCDEC_PTR22]], align 2
+; KRYO-NEXT:    [[CONV32:%.*]] = zext i16 [[TMP12]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR33:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 5
+; KRYO-NEXT:    [[TMP13:%.*]] = load i16, ptr [[INCDEC_PTR24]], align 2
+; KRYO-NEXT:    [[CONV34:%.*]] = zext i16 [[TMP13]] to i64
+; KRYO-NEXT:    [[SUB35:%.*]] = sub nsw i64 [[CONV32]], [[CONV34]]
+; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB35]]
+; KRYO-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
+; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP14]] to i32
 ; KRYO-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; KRYO-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
-; KRYO-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
-; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP23]]
-; KRYO-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
-; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR40:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 6
+; KRYO-NEXT:    [[TMP15:%.*]] = load i16, ptr [[INCDEC_PTR31]], align 2
+; KRYO-NEXT:    [[CONV41:%.*]] = zext i16 [[TMP15]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR42:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 6
+; KRYO-NEXT:    [[TMP16:%.*]] = load i16, ptr [[INCDEC_PTR33]], align 2
+; KRYO-NEXT:    [[CONV43:%.*]] = zext i16 [[TMP16]] to i64
+; KRYO-NEXT:    [[SUB44:%.*]] = sub nsw i64 [[CONV41]], [[CONV43]]
+; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB44]]
+; KRYO-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
+; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP17]] to i32
 ; KRYO-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; KRYO-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
-; KRYO-NEXT:    [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
-; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP26]]
-; KRYO-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
-; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR49:%.*]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 7
+; KRYO-NEXT:    [[TMP18:%.*]] = load i16, ptr [[INCDEC_PTR40]], align 2
+; KRYO-NEXT:    [[CONV50:%.*]] = zext i16 [[TMP18]] to i64
+; KRYO-NEXT:    [[INCDEC_PTR51:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 7
+; KRYO-NEXT:    [[TMP19:%.*]] = load i16, ptr [[INCDEC_PTR42]], align 2
+; KRYO-NEXT:    [[CONV52:%.*]] = zext i16 [[TMP19]] to i64
+; KRYO-NEXT:    [[SUB53:%.*]] = sub nsw i64 [[CONV50]], [[CONV52]]
+; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB53]]
+; KRYO-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
+; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP20]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; KRYO-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
-; KRYO-NEXT:    [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
-; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP29]]
-; KRYO-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
-; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; KRYO-NEXT:    [[TMP21:%.*]] = load i16, ptr [[INCDEC_PTR49]], align 2
+; KRYO-NEXT:    [[CONV59:%.*]] = zext i16 [[TMP21]] to i64
+; KRYO-NEXT:    [[TMP22:%.*]] = load i16, ptr [[INCDEC_PTR51]], align 2
+; KRYO-NEXT:    [[CONV61:%.*]] = zext i16 [[TMP22]] to i64
+; KRYO-NEXT:    [[SUB62:%.*]] = sub nsw i64 [[CONV59]], [[CONV61]]
+; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[SUB62]]
+; KRYO-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
+; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP23]] to i32
 ; KRYO-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
 ; KRYO-NEXT:    [[INC]] = add nuw nsw i32 [[I_0103]], 1
 ; KRYO-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]

diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index a567b6c71e898..5a4d7086c9b1b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -24,25 +24,25 @@ target triple = "aarch64--linux-gnu"
 ;
 
 ; YAML-LABEL: Function:        getelementptr_4x32
-; YAML:      --- !Passed
+; YAML:      --- !Missed
 ; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Name:            NotBeneficial
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '6'
-; YAML-NEXT:   - String:          ' and with tree size '
-; YAML-NEXT:   - TreeSize:        '3'
+; YAML-NEXT:   - String:          'List vectorization was possible but not beneficial with cost '
+; YAML-NEXT:   - Cost:            '-7'
+; YAML-NEXT:   - String:          ' >= '
+; YAML-NEXT:   - Treshold:        '7'
 
-; YAML:      --- !Passed
+; YAML:      --- !Missed
 ; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Name:            NotBeneficial
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '6'
-; YAML-NEXT:   - String:          ' and with tree size '
-; YAML-NEXT:   - TreeSize:        '3'
+; YAML-NEXT:   - String:          'List vectorization was possible but not beneficial with cost '
+; YAML-NEXT:   - Cost:            '-7'
+; YAML-NEXT:   - String:          ' >= '
+; YAML-NEXT:   - Treshold:        '7'
 
 define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @getelementptr_4x32(
@@ -50,9 +50,6 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i64 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -63,28 +60,23 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[T4]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
+; CHECK-NEXT:    [[T7:%.*]] = add nsw i32 [[T4]], [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[T7]] to i64
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0
-; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP12]]
+; CHECK-NEXT:    [[T9:%.*]] = add nsw i32 [[T4]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[T9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1
-; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP14]]
+; CHECK-NEXT:    [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[T11]] to i64
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
 ; CHECK-NEXT:    [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
@@ -131,15 +123,15 @@ for.body:
 }
 
 ; YAML-LABEL: Function:        getelementptr_2x32
-; YAML:      --- !Passed
+; YAML:      --- !Missed
 ; YAML-NEXT: Pass:            slp-vectorizer
-; YAML-NEXT: Name:            VectorizedList
+; YAML-NEXT: Name:            NotBeneficial
 ; YAML-NEXT: Function:        getelementptr_2x32
 ; YAML-NEXT: Args:
-; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '6'
-; YAML-NEXT:   - String:          ' and with tree size '
-; YAML-NEXT:   - TreeSize:        '3'
+; YAML-NEXT:   - String:          'List vectorization was possible but not beneficial with cost '
+; YAML-NEXT:   - Cost:            '-7'
+; YAML-NEXT:   - String:          ' >= '
+; YAML-NEXT:   - Treshold:        '7'
 
 define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @getelementptr_2x32(
@@ -147,8 +139,6 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Z:%.*]], i64 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -159,26 +149,23 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[T4]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[T4]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
 ; CHECK-NEXT:    [[T7:%.*]] = or i32 [[T4]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[T7]] to i64
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[T7]] to i64
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP8]]
+; CHECK-NEXT:    [[T9:%.*]] = add nsw i32 [[T4]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[T9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP6]], i64 1
-; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP10]]
+; CHECK-NEXT:    [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[T11]] to i64
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
 ; CHECK-NEXT:    [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
@@ -232,44 +219,84 @@ define void @test_i16_extend(ptr %p.1, ptr %p.2, i32 %idx.i32) {
 ; CHECK-LABEL: @test_i16_extend(
 ; CHECK-NEXT:    [[P_0:%.*]] = load ptr, ptr @global, align 8
 ; CHECK-NEXT:    [[IDX_0:%.*]] = zext i32 [[IDX_I32:%.*]] to i64
+; CHECK-NEXT:    [[IDX_1:%.*]] = add nuw nsw i64 [[IDX_0]], 1
+; CHECK-NEXT:    [[IDX_2:%.*]] = add nuw nsw i64 [[IDX_0]], 2
+; CHECK-NEXT:    [[IDX_3:%.*]] = add nuw nsw i64 [[IDX_0]], 3
+; CHECK-NEXT:    [[IDX_4:%.*]] = add nuw nsw i64 [[IDX_0]], 4
+; CHECK-NEXT:    [[IDX_5:%.*]] = add nuw nsw i64 [[IDX_0]], 5
+; CHECK-NEXT:    [[IDX_6:%.*]] = add nuw nsw i64 [[IDX_0]], 6
+; CHECK-NEXT:    [[IDX_7:%.*]] = add nuw nsw i64 [[IDX_0]], 7
 ; CHECK-NEXT:    [[T53:%.*]] = getelementptr inbounds i16, ptr [[P_1:%.*]], i64 [[IDX_0]]
+; CHECK-NEXT:    [[OP1_L:%.*]] = load i16, ptr [[T53]], align 2
+; CHECK-NEXT:    [[OP1_EXT:%.*]] = zext i16 [[OP1_L]] to i64
 ; CHECK-NEXT:    [[T56:%.*]] = getelementptr inbounds i16, ptr [[P_2:%.*]], i64 [[IDX_0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[T53]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr [[T56]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <8 x i16> [[TMP5]] to <8 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP7]], i64 0
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[T60:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP9]]
+; CHECK-NEXT:    [[OP2_L:%.*]] = load i16, ptr [[T56]], align 2
+; CHECK-NEXT:    [[OP2_EXT:%.*]] = zext i16 [[OP2_L]] to i64
+; CHECK-NEXT:    [[SUB_1:%.*]] = sub nsw i64 [[OP1_EXT]], [[OP2_EXT]]
+; CHECK-NEXT:    [[T60:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_1]]
 ; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[T60]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP7]], i64 1
-; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
-; CHECK-NEXT:    [[T71:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP11]]
+; CHECK-NEXT:    [[T64:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_1]]
+; CHECK-NEXT:    [[T65:%.*]] = load i16, ptr [[T64]], align 2
+; CHECK-NEXT:    [[T66:%.*]] = zext i16 [[T65]] to i64
+; CHECK-NEXT:    [[T67:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_1]]
+; CHECK-NEXT:    [[T68:%.*]] = load i16, ptr [[T67]], align 2
+; CHECK-NEXT:    [[T69:%.*]] = zext i16 [[T68]] to i64
+; CHECK-NEXT:    [[SUB_2:%.*]] = sub nsw i64 [[T66]], [[T69]]
+; CHECK-NEXT:    [[T71:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_2]]
 ; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[T71]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[TMP7]], i64 2
-; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
-; CHECK-NEXT:    [[T82:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP13]]
+; CHECK-NEXT:    [[T75:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_2]]
+; CHECK-NEXT:    [[T76:%.*]] = load i16, ptr [[T75]], align 2
+; CHECK-NEXT:    [[T77:%.*]] = zext i16 [[T76]] to i64
+; CHECK-NEXT:    [[T78:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_2]]
+; CHECK-NEXT:    [[T79:%.*]] = load i16, ptr [[T78]], align 2
+; CHECK-NEXT:    [[T80:%.*]] = zext i16 [[T79]] to i64
+; CHECK-NEXT:    [[SUB_3:%.*]] = sub nsw i64 [[T77]], [[T80]]
+; CHECK-NEXT:    [[T82:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_3]]
 ; CHECK-NEXT:    [[L_3:%.*]] = load i32, ptr [[T82]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP7]], i64 3
-; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; CHECK-NEXT:    [[T93:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP15]]
+; CHECK-NEXT:    [[T86:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_3]]
+; CHECK-NEXT:    [[T87:%.*]] = load i16, ptr [[T86]], align 2
+; CHECK-NEXT:    [[T88:%.*]] = zext i16 [[T87]] to i64
+; CHECK-NEXT:    [[T89:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_3]]
+; CHECK-NEXT:    [[T90:%.*]] = load i16, ptr [[T89]], align 2
+; CHECK-NEXT:    [[T91:%.*]] = zext i16 [[T90]] to i64
+; CHECK-NEXT:    [[SUB_4:%.*]] = sub nsw i64 [[T88]], [[T91]]
+; CHECK-NEXT:    [[T93:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_4]]
 ; CHECK-NEXT:    [[L_4:%.*]] = load i32, ptr [[T93]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP7]], i64 4
-; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
-; CHECK-NEXT:    [[T104:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP17]]
+; CHECK-NEXT:    [[T97:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_4]]
+; CHECK-NEXT:    [[T98:%.*]] = load i16, ptr [[T97]], align 2
+; CHECK-NEXT:    [[T99:%.*]] = zext i16 [[T98]] to i64
+; CHECK-NEXT:    [[T100:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_4]]
+; CHECK-NEXT:    [[T101:%.*]] = load i16, ptr [[T100]], align 2
+; CHECK-NEXT:    [[T102:%.*]] = zext i16 [[T101]] to i64
+; CHECK-NEXT:    [[SUB_5:%.*]] = sub nsw i64 [[T99]], [[T102]]
+; CHECK-NEXT:    [[T104:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_5]]
 ; CHECK-NEXT:    [[L_5:%.*]] = load i32, ptr [[T104]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP7]], i64 5
-; CHECK-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
-; CHECK-NEXT:    [[T115:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP19]]
+; CHECK-NEXT:    [[T108:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_5]]
+; CHECK-NEXT:    [[T109:%.*]] = load i16, ptr [[T108]], align 2
+; CHECK-NEXT:    [[T110:%.*]] = zext i16 [[T109]] to i64
+; CHECK-NEXT:    [[T111:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_5]]
+; CHECK-NEXT:    [[T112:%.*]] = load i16, ptr [[T111]], align 2
+; CHECK-NEXT:    [[T113:%.*]] = zext i16 [[T112]] to i64
+; CHECK-NEXT:    [[SUB_6:%.*]] = sub nsw i64 [[T110]], [[T113]]
+; CHECK-NEXT:    [[T115:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_6]]
 ; CHECK-NEXT:    [[L_6:%.*]] = load i32, ptr [[T115]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP7]], i64 6
-; CHECK-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
-; CHECK-NEXT:    [[T126:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP21]]
+; CHECK-NEXT:    [[T119:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_6]]
+; CHECK-NEXT:    [[T120:%.*]] = load i16, ptr [[T119]], align 2
+; CHECK-NEXT:    [[T121:%.*]] = zext i16 [[T120]] to i64
+; CHECK-NEXT:    [[T122:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_6]]
+; CHECK-NEXT:    [[T123:%.*]] = load i16, ptr [[T122]], align 2
+; CHECK-NEXT:    [[T124:%.*]] = zext i16 [[T123]] to i64
+; CHECK-NEXT:    [[SUB_7:%.*]] = sub nsw i64 [[T121]], [[T124]]
+; CHECK-NEXT:    [[T126:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_7]]
 ; CHECK-NEXT:    [[L_7:%.*]] = load i32, ptr [[T126]], align 4
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <8 x i32> [[TMP7]], i64 7
-; CHECK-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
-; CHECK-NEXT:    [[T137:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP23]]
+; CHECK-NEXT:    [[T130:%.*]] = getelementptr inbounds i16, ptr [[P_1]], i64 [[IDX_7]]
+; CHECK-NEXT:    [[T131:%.*]] = load i16, ptr [[T130]], align 2
+; CHECK-NEXT:    [[T132:%.*]] = zext i16 [[T131]] to i64
+; CHECK-NEXT:    [[T133:%.*]] = getelementptr inbounds i16, ptr [[P_2]], i64 [[IDX_7]]
+; CHECK-NEXT:    [[T134:%.*]] = load i16, ptr [[T133]], align 2
+; CHECK-NEXT:    [[T135:%.*]] = zext i16 [[T134]] to i64
+; CHECK-NEXT:    [[SUB_8:%.*]] = sub nsw i64 [[T132]], [[T135]]
+; CHECK-NEXT:    [[T137:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[SUB_8]]
 ; CHECK-NEXT:    [[L_8:%.*]] = load i32, ptr [[T137]], align 4
 ; CHECK-NEXT:    call void @use(i32 [[L_1]], i32 [[L_2]], i32 [[L_3]], i32 [[L_4]], i32 [[L_5]], i32 [[L_6]], i32 [[L_7]], i32 [[L_8]])
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/gep-indices.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/gep-indices.ll
new file mode 100644
index 0000000000000..348fde129c855
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/gep-indices.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \
+; RUN:   | FileCheck %s
+;
+; Test that gep indices are not first vectorized and then extracted (into address registers).
+
+%StructTy = type { i8, i64, i64, i64, i64 }
+declare void @bar(ptr, ptr)
+
+define void @fun(ptr %Addr) {
+; CHECK-LABEL: @fun(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[P2472:%.*]] = getelementptr inbounds [[STRUCTTY:%.*]], ptr [[ADDR:%.*]], i64 [[INDVARS_IV]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[P2472]], align 8
+; CHECK-NEXT:    [[P3476:%.*]] = getelementptr inbounds [[STRUCTTY]], ptr [[ADDR]], i64 [[INDVARS_IV]], i32 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[P3476]], align 8
+; CHECK-NEXT:    [[SEXT:%.*]] = shl i64 [[TMP0]], 32
+; CHECK-NEXT:    [[IDXPROM495:%.*]] = ashr exact i64 [[SEXT]], 32
+; CHECK-NEXT:    [[ARRAYIDX496:%.*]] = getelementptr inbounds [3 x float], ptr null, i64 [[IDXPROM495]]
+; CHECK-NEXT:    [[SEXT4:%.*]] = shl i64 [[TMP1]], 32
+; CHECK-NEXT:    [[IDXPROM499:%.*]] = ashr exact i64 [[SEXT4]], 32
+; CHECK-NEXT:    [[ARRAYIDX500:%.*]] = getelementptr inbounds [3 x float], ptr null, i64 [[IDXPROM499]]
+; CHECK-NEXT:    tail call void @bar(ptr noundef poison, ptr noundef [[ARRAYIDX500]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    br label [[FOR_COND]]
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.cond ], [ 0, %entry ]
+  %P2472 = getelementptr inbounds %StructTy, ptr %Addr, i64 %indvars.iv, i32 3
+  %0 = load i64, ptr %P2472, align 8
+  %P3476 = getelementptr inbounds %StructTy, ptr %Addr, i64 %indvars.iv, i32 4
+  %1 = load i64, ptr %P3476, align 8
+  %sext = shl i64 %0, 32
+  %idxprom495 = ashr exact i64 %sext, 32
+  %arrayidx496 = getelementptr inbounds [3 x float], ptr null, i64 %idxprom495
+  %sext4 = shl i64 %1, 32
+  %idxprom499 = ashr exact i64 %sext4, 32
+  %arrayidx500 = getelementptr inbounds [3 x float], ptr null, i64 %idxprom499
+  tail call void @bar(ptr noundef poison, ptr noundef %arrayidx500)
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br label %for.cond
+}

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index 5b0ecdb779d23..69eded4e96740 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -134,24 +134,49 @@ define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x)
 }
 
 define void @PR43578_prefer128(ptr %r, ptr %p, ptr %q) #0 {
-; CHECK-LABEL: @PR43578_prefer128(
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 2
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[Q]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[P2]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[Q2]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP10]]
-; CHECK-NEXT:    ret void
+; AVX2-LABEL: @PR43578_prefer128(
+; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 1
+; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 2
+; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 3
+; AVX2-NEXT:    [[Q1:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 1
+; AVX2-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 2
+; AVX2-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 3
+; AVX2-NEXT:    [[X0:%.*]] = load i64, ptr [[P]], align 2
+; AVX2-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 2
+; AVX2-NEXT:    [[X2:%.*]] = load i64, ptr [[P2]], align 2
+; AVX2-NEXT:    [[X3:%.*]] = load i64, ptr [[P3]], align 2
+; AVX2-NEXT:    [[Y0:%.*]] = load i64, ptr [[Q]], align 2
+; AVX2-NEXT:    [[Y1:%.*]] = load i64, ptr [[Q1]], align 2
+; AVX2-NEXT:    [[Y2:%.*]] = load i64, ptr [[Q2]], align 2
+; AVX2-NEXT:    [[Y3:%.*]] = load i64, ptr [[Q3]], align 2
+; AVX2-NEXT:    [[SUB0:%.*]] = sub nsw i64 [[X0]], [[Y0]]
+; AVX2-NEXT:    [[SUB1:%.*]] = sub nsw i64 [[X1]], [[Y1]]
+; AVX2-NEXT:    [[SUB2:%.*]] = sub nsw i64 [[X2]], [[Y2]]
+; AVX2-NEXT:    [[SUB3:%.*]] = sub nsw i64 [[X3]], [[Y3]]
+; AVX2-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[SUB0]]
+; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB1]]
+; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB2]]
+; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB3]]
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @PR43578_prefer128(
+; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 2
+; AVX512-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 2
+; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 2
+; AVX512-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[Q]], align 2
+; AVX512-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[P2]], align 2
+; AVX512-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[Q2]], align 2
+; AVX512-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]]
+; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX512-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[TMP7]]
+; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX512-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP8]]
+; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; AVX512-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP9]]
+; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; AVX512-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP10]]
+; AVX512-NEXT:    ret void
 ;
   %p1 = getelementptr inbounds i64, ptr %p, i64 1
   %p2 = getelementptr inbounds i64, ptr %p, i64 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index 49b88f85968a7..c7f2e174986b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -134,24 +134,49 @@ define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x)
 }
 
 define void @PR43578_prefer128(ptr %r, ptr %p, ptr %q) #0 {
-; CHECK-LABEL: @PR43578_prefer128(
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 2
-; CHECK-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[Q]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[P2]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[Q2]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP10]]
-; CHECK-NEXT:    ret void
+; AVX2-LABEL: @PR43578_prefer128(
+; AVX2-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 1
+; AVX2-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 2
+; AVX2-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 3
+; AVX2-NEXT:    [[Q1:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 1
+; AVX2-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 2
+; AVX2-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 3
+; AVX2-NEXT:    [[X0:%.*]] = load i64, ptr [[P]], align 2
+; AVX2-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 2
+; AVX2-NEXT:    [[X2:%.*]] = load i64, ptr [[P2]], align 2
+; AVX2-NEXT:    [[X3:%.*]] = load i64, ptr [[P3]], align 2
+; AVX2-NEXT:    [[Y0:%.*]] = load i64, ptr [[Q]], align 2
+; AVX2-NEXT:    [[Y1:%.*]] = load i64, ptr [[Q1]], align 2
+; AVX2-NEXT:    [[Y2:%.*]] = load i64, ptr [[Q2]], align 2
+; AVX2-NEXT:    [[Y3:%.*]] = load i64, ptr [[Q3]], align 2
+; AVX2-NEXT:    [[SUB0:%.*]] = sub nsw i64 [[X0]], [[Y0]]
+; AVX2-NEXT:    [[SUB1:%.*]] = sub nsw i64 [[X1]], [[Y1]]
+; AVX2-NEXT:    [[SUB2:%.*]] = sub nsw i64 [[X2]], [[Y2]]
+; AVX2-NEXT:    [[SUB3:%.*]] = sub nsw i64 [[X3]], [[Y3]]
+; AVX2-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[SUB0]]
+; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB1]]
+; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB2]]
+; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB3]]
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @PR43578_prefer128(
+; AVX512-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 2
+; AVX512-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 2
+; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[P]], align 2
+; AVX512-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[Q]], align 2
+; AVX512-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[P2]], align 2
+; AVX512-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[Q2]], align 2
+; AVX512-NEXT:    [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]]
+; AVX512-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX512-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[TMP7]]
+; AVX512-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX512-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP8]]
+; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; AVX512-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP9]]
+; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; AVX512-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP10]]
+; AVX512-NEXT:    ret void
 ;
   %p1 = getelementptr inbounds i64, ptr %p, i64 1
   %p2 = getelementptr inbounds i64, ptr %p, i64 2

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
index 68ed6062e3c40..4ef0e3e5fbfdf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+sse2     -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx      -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx2     -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512f  -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512vl -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+sse2     -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx      -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx2     -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512f  -S | FileCheck %s --check-prefixes=AVX512F
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512vl -S | FileCheck %s --check-prefixes=AVX512VL
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -15,21 +15,76 @@ target triple = "x86_64-unknown-linux-gnu"
 ; zero-extend the roots back to their original sizes.
 ;
 define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
-; CHECK-LABEL: @PR31243_zext(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; CHECK-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
-; CHECK-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
-; CHECK-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
-; CHECK-NEXT:    ret i8 [[TMP_8]]
+; SSE-LABEL: @PR31243_zext(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; SSE-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; SSE-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
+; SSE-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
+; SSE-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
+; SSE-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
+; SSE-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
+; SSE-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
+; SSE-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
+; SSE-NEXT:    ret i8 [[TMP_8]]
+;
+; AVX-LABEL: @PR31243_zext(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; AVX-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; AVX-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
+; AVX-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
+; AVX-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
+; AVX-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
+; AVX-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
+; AVX-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
+; AVX-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
+; AVX-NEXT:    ret i8 [[TMP_8]]
+;
+; AVX2-LABEL: @PR31243_zext(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; AVX2-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; AVX2-NEXT:    [[TMP2:%.*]] = zext i8 [[TMP0]] to i64
+; AVX2-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i64
+; AVX2-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
+; AVX2-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
+; AVX2-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
+; AVX2-NEXT:    ret i8 [[TMP_8]]
+;
+; AVX512F-LABEL: @PR31243_zext(
+; AVX512F-NEXT:  entry:
+; AVX512F-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
+; AVX512F-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; AVX512F-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; AVX512F-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
+; AVX512F-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
+; AVX512F-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
+; AVX512F-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
+; AVX512F-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
+; AVX512F-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
+; AVX512F-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
+; AVX512F-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
+; AVX512F-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
+; AVX512F-NEXT:    ret i8 [[TMP_8]]
+;
+; AVX512VL-LABEL: @PR31243_zext(
+; AVX512VL-NEXT:  entry:
+; AVX512VL-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
+; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; AVX512VL-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
+; AVX512VL-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
+; AVX512VL-NEXT:    [[TMP_4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
+; AVX512VL-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
+; AVX512VL-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
+; AVX512VL-NEXT:    [[TMP_5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
+; AVX512VL-NEXT:    [[TMP_6:%.*]] = load i8, ptr [[TMP_4]], align 1
+; AVX512VL-NEXT:    [[TMP_7:%.*]] = load i8, ptr [[TMP_5]], align 1
+; AVX512VL-NEXT:    [[TMP_8:%.*]] = add i8 [[TMP_6]], [[TMP_7]]
+; AVX512VL-NEXT:    ret i8 [[TMP_8]]
 ;
 entry:
   %tmp_0 = zext i8 %v0 to i32
@@ -73,21 +128,64 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ;
 ; AVX-LABEL: @PR31243_sext(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
-; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
-; AVX-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
-; AVX-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0
-; AVX-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]]
-; AVX-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1
-; AVX-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]]
+; AVX-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; AVX-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; AVX-NEXT:    [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
+; AVX-NEXT:    [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
 ; AVX-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
 ; AVX-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
 ; AVX-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
 ; AVX-NEXT:    ret i8 [[TMP8]]
 ;
+; AVX2-LABEL: @PR31243_sext(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[TMP0:%.*]] = or i8 [[V0:%.*]], 1
+; AVX2-NEXT:    [[TMP1:%.*]] = or i8 [[V1:%.*]], 1
+; AVX2-NEXT:    [[TMP2:%.*]] = sext i8 [[TMP0]] to i64
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP2]]
+; AVX2-NEXT:    [[TMP3:%.*]] = sext i8 [[TMP1]] to i64
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; AVX2-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; AVX2-NEXT:    ret i8 [[TMP8]]
+;
+; AVX512F-LABEL: @PR31243_sext(
+; AVX512F-NEXT:  entry:
+; AVX512F-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
+; AVX512F-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; AVX512F-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; AVX512F-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
+; AVX512F-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0
+; AVX512F-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
+; AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]]
+; AVX512F-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1
+; AVX512F-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]]
+; AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; AVX512F-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; AVX512F-NEXT:    ret i8 [[TMP8]]
+;
+; AVX512VL-LABEL: @PR31243_sext(
+; AVX512VL-NEXT:  entry:
+; AVX512VL-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
+; AVX512VL-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; AVX512VL-NEXT:    [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
+; AVX512VL-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i64 0
+; AVX512VL-NEXT:    [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
+; AVX512VL-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP5]]
+; AVX512VL-NEXT:    [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i64 1
+; AVX512VL-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP7]]
+; AVX512VL-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; AVX512VL-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; AVX512VL-NEXT:    [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; AVX512VL-NEXT:    ret i8 [[TMP8]]
+;
 entry:
   %tmp0 = sext i8 %v0 to i32
   %tmp1 = sext i8 %v1 to i32

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
index 85791827e0894..97ca690e15e77 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
@@ -4,18 +4,29 @@
 define void @test(ptr %r, ptr %p, ptr %q) #0 {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[P0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 0
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 1
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 2
+; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 3
 ; CHECK-NEXT:    [[Q0:%.*]] = getelementptr inbounds i64, ptr [[Q:%.*]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[P0]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[Q0]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[TMP7]]
+; CHECK-NEXT:    [[Q1:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 1
+; CHECK-NEXT:    [[Q2:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 2
+; CHECK-NEXT:    [[Q3:%.*]] = getelementptr inbounds i64, ptr [[Q]], i64 3
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[P0]], align 2
+; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[P1]], align 2
+; CHECK-NEXT:    [[X2:%.*]] = load i64, ptr [[P2]], align 2
+; CHECK-NEXT:    [[X3:%.*]] = load i64, ptr [[P3]], align 2
+; CHECK-NEXT:    [[Y0:%.*]] = load i64, ptr [[Q0]], align 2
+; CHECK-NEXT:    [[Y1:%.*]] = load i64, ptr [[Q1]], align 2
+; CHECK-NEXT:    [[Y2:%.*]] = load i64, ptr [[Q2]], align 2
+; CHECK-NEXT:    [[Y3:%.*]] = load i64, ptr [[Q3]], align 2
+; CHECK-NEXT:    [[SUB0:%.*]] = sub nsw i64 [[X0]], [[Y0]]
+; CHECK-NEXT:    [[SUB1:%.*]] = sub nsw i64 [[X1]], [[Y1]]
+; CHECK-NEXT:    [[SUB2:%.*]] = sub nsw i64 [[X2]], [[Y2]]
+; CHECK-NEXT:    [[SUB3:%.*]] = sub nsw i64 [[X3]], [[Y3]]
+; CHECK-NEXT:    [[G0:%.*]] = getelementptr inbounds i32, ptr [[R:%.*]], i64 [[SUB0]]
+; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB1]]
+; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB2]]
+; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[SUB3]]
 ; CHECK-NEXT:    ret void
 ;
   %p0 = getelementptr inbounds i64, ptr %p, i64 0

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
index dc61cd551b6f7..0b9ed47ce0f17 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
@@ -13,28 +13,30 @@ define void @get_block(i32 %y_pos) local_unnamed_addr #0 {
 ; CHECK:       if.end:
 ; CHECK-NEXT:    [[SUB14:%.*]] = sub nsw i32 [[Y_POS:%.*]], undef
 ; CHECK-NEXT:    [[SHR15:%.*]] = ashr i32 [[SUB14]], 2
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[SHR15]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[SUB14]], i32 1
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], <i32 0, i32 -1, i32 -5, i32 -9>
-; CHECK-NEXT:    [[TMP3:%.*]] = freeze <4 x i32> [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], undef
-; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> undef
-; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i32> [[TMP6]] to <4 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31_1:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP12]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31_2:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
-; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
-; CHECK-NEXT:    [[ARRAYIDX31_3:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[TMP16]]
+; CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp sgt i32 [[SHR15]], 0
+; CHECK-NEXT:    [[COND_I_I:%.*]] = select i1 [[CMP_I_I]], i32 [[SHR15]], i32 0
+; CHECK-NEXT:    [[CMP_I4_I:%.*]] = icmp slt i32 [[COND_I_I]], undef
+; CHECK-NEXT:    [[COND_I5_I:%.*]] = select i1 [[CMP_I4_I]], i32 [[COND_I_I]], i32 undef
+; CHECK-NEXT:    [[IDXPROM30:%.*]] = sext i32 [[COND_I5_I]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30]]
+; CHECK-NEXT:    [[CMP_I_I_1:%.*]] = icmp sgt i32 [[SUB14]], -1
+; CHECK-NEXT:    [[COND_I_I_1:%.*]] = select i1 [[CMP_I_I_1]], i32 undef, i32 0
+; CHECK-NEXT:    [[CMP_I4_I_1:%.*]] = icmp slt i32 [[COND_I_I_1]], undef
+; CHECK-NEXT:    [[COND_I5_I_1:%.*]] = select i1 [[CMP_I4_I_1]], i32 [[COND_I_I_1]], i32 undef
+; CHECK-NEXT:    [[IDXPROM30_1:%.*]] = sext i32 [[COND_I5_I_1]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31_1:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30_1]]
+; CHECK-NEXT:    [[CMP_I_I_2:%.*]] = icmp sgt i32 [[SUB14]], -5
+; CHECK-NEXT:    [[COND_I_I_2:%.*]] = select i1 [[CMP_I_I_2]], i32 undef, i32 0
+; CHECK-NEXT:    [[CMP_I4_I_2:%.*]] = icmp slt i32 [[COND_I_I_2]], undef
+; CHECK-NEXT:    [[COND_I5_I_2:%.*]] = select i1 [[CMP_I4_I_2]], i32 [[COND_I_I_2]], i32 undef
+; CHECK-NEXT:    [[IDXPROM30_2:%.*]] = sext i32 [[COND_I5_I_2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31_2:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30_2]]
+; CHECK-NEXT:    [[CMP_I_I_3:%.*]] = icmp sgt i32 [[SUB14]], -9
+; CHECK-NEXT:    [[COND_I_I_3:%.*]] = select i1 [[CMP_I_I_3]], i32 undef, i32 0
+; CHECK-NEXT:    [[CMP_I4_I_3:%.*]] = icmp slt i32 [[COND_I_I_3]], undef
+; CHECK-NEXT:    [[COND_I5_I_3:%.*]] = select i1 [[CMP_I4_I_3]], i32 [[COND_I_I_3]], i32 undef
+; CHECK-NEXT:    [[IDXPROM30_3:%.*]] = sext i32 [[COND_I5_I_3]] to i64
+; CHECK-NEXT:    [[ARRAYIDX31_3:%.*]] = getelementptr inbounds ptr, ptr undef, i64 [[IDXPROM30_3]]
 ; CHECK-NEXT:    unreachable
 ;
 entry:


        


More information about the llvm-commits mailing list