[llvm] a8fe120 - [LV] Enable the LoopVectorizer to create pointer inductions

Thu Jul 2 03:39:38 PDT 2020

Author: Anna Welker
Date: 2020-07-02T11:39:28+01:00
New Revision: a8fe12065ec8137e55a6a8b35dd5355477c2ac16

URL: https://github.com/llvm/llvm-project/commit/a8fe12065ec8137e55a6a8b35dd5355477c2ac16
DIFF: https://github.com/llvm/llvm-project/commit/a8fe12065ec8137e55a6a8b35dd5355477c2ac16.diff

LOG: [LV] Enable the LoopVectorizer to create pointer inductions

This patch enables the LoopVectorizer to build a phi of pointer
type and provide the vector loads and stores with vector-type
getelementptrs built from the pointer induction variable, which
produces far fewer instructions than the previous approach of
creating scalar getelementptrs and gluing them together into a
vector.

Differential Revision: https://reviews.llvm.org/D81267

Added: 
    llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bfb4f29d7112..0c96842480c1 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4201,26 +4201,66 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
   case InductionDescriptor::IK_PtrInduction: {
     // Handle the pointer induction variable case.
     assert(P->getType()->isPointerTy() && "Unexpected type.");
-    // This is the normalized GEP that starts counting at zero.
-    Value *PtrInd = Induction;
-    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
-    // Determine the number of scalars we need to generate for each unroll
-    // iteration. If the instruction is uniform, we only need to generate the
-    // first lane. Otherwise, we generate all VF values.
-    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
-    // These are the scalar results. Notice that we don't generate vector GEPs
-    // because scalar GEPs result in better code.
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
-        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
-        Value *SclrGep =
-            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
-        SclrGep->setName("next.gep");
-        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
+
+    if (Cost->isScalarAfterVectorization(P, VF)) {
+      // This is the normalized GEP that starts counting at zero.
+      Value *PtrInd =
+          Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
+      // Determine the number of scalars we need to generate for each unroll
+      // iteration. If the instruction is uniform, we only need to generate the
+      // first lane. Otherwise, we generate all VF values.
+      unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+          Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
+          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
+          Value *SclrGep =
+              emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
+          SclrGep->setName("next.gep");
+          VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
+        }
       }
+      return;
+    }
+    assert(isa<SCEVConstant>(II.getStep()) &&
+           "Induction step not a SCEV constant!");
+    Type *PhiType = II.getStep()->getType();
+
+    // Build a pointer phi
+    Value *ScalarStartValue = II.getStartValue();
+    Type *ScStValueType = ScalarStartValue->getType();
+    PHINode *NewPointerPhi =
+        PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
+    NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
+
+    // A pointer induction, performed by using a gep
+    const SCEV *ScalarStep = II.getStep();
+    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+    Value *ScalarStepValue =
+        Exp.expandCodeFor(ScalarStep, PhiType, &*Builder.GetInsertPoint());
+    Value *InductionGEP = Builder.CreateGEP(
+        ScStValueType->getPointerElementType(), NewPointerPhi,
+        Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)));
+    NewPointerPhi->addIncoming(InductionGEP,
+                               cast<Instruction>(InductionGEP)->getParent());
+
+    // Create UF many actual address geps that use the pointer
+    // phi as base and a vectorized version of the step value
+    // (<step*0, ..., step*N>) as offset.
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<Constant *, 8> Indices;
+      // Create a vector of consecutive numbers from zero to VF.
+      for (unsigned i = 0; i < VF; ++i)
+        Indices.push_back(ConstantInt::get(PhiType, i + Part * VF));
+      Constant *StartOffset = ConstantVector::get(Indices);
+
+      Value *GEP = Builder.CreateGEP(
+          ScStValueType->getPointerElementType(), NewPointerPhi,
+          Builder.CreateMul(StartOffset,
+                            Builder.CreateVectorSplat(VF, ScalarStepValue),
+                            "vector.gep"));
+      VectorLoopValueMap.setVectorValue(P, Part, GEP);
     }
-    return;
   }
   }
 }
@@ -4456,6 +4496,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
   // accesses that will remain scalar.
   SmallSetVector<Instruction *, 8> ScalarPtrs;
   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+  auto *Latch = TheLoop->getLoopLatch();
 
   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
   // The pointer operands of loads and stores will be scalar as long as the
@@ -4481,11 +4522,33 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
            !TheLoop->isLoopInvariant(V);
   };
 
-  // A helper that evaluates a memory access's use of a pointer. If the use
-  // will be a scalar use, and the pointer is only used by memory accesses, we
-  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
-  // PossibleNonScalarPtrs.
+  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
+    if (!isa<PHINode>(Ptr) ||
+        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
+      return false;
+    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
+    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
+      return false;
+    return isScalarUse(MemAccess, Ptr);
+  };
+
+  // A helper that evaluates a memory access's use of a pointer. If the
+  // pointer is actually the pointer induction of a loop, it is being
+  // inserted into Worklist. If the use will be a scalar use, and the
+  // pointer is only used by memory accesses, we place the pointer in
+  // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+    if (isScalarPtrInduction(MemAccess, Ptr)) {
+      Worklist.insert(cast<Instruction>(Ptr));
+      Instruction *Update = cast<Instruction>(
+          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
+      Worklist.insert(Update);
+      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
+                        << "\n");
+      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
+                        << "\n");
+      return;
+    }
     // We only care about bitcast and getelementptr instructions contained in
     // the loop.
     if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -4509,10 +4572,9 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
   };
 
   // We seed the scalars analysis with three classes of instructions: (1)
-  // instructions marked uniform-after-vectorization, (2) bitcast and
-  // getelementptr instructions used by memory accesses requiring a scalar use,
-  // and (3) pointer induction variables and their update instructions (we
-  // currently only scalarize these).
+  // instructions marked uniform-after-vectorization and (2) bitcast,
+  // getelementptr and (pointer) phi instructions used by memory accesses
+  // requiring a scalar use.
   //
   // (1) Add to the worklist all instructions that have been identified as
   // uniform-after-vectorization.
@@ -4538,24 +4600,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
       Worklist.insert(I);
     }
 
-  // (3) Add to the worklist all pointer induction variables and their update
-  // instructions.
-  //
-  // TODO: Once we are able to vectorize pointer induction variables we should
-  //       no longer insert them into the worklist here.
-  auto *Latch = TheLoop->getLoopLatch();
-  for (auto &Induction : Legal->getInductionVars()) {
-    auto *Ind = Induction.first;
-    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
-      continue;
-    Worklist.insert(Ind);
-    Worklist.insert(IndUpdate);
-    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
-    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
-                      << "\n");
-  }
-
   // Insert the forced scalars.
   // FIXME: Currently widenPHIInstruction() often creates a dead vector
   // induction variable when the PHI user is scalarized.
@@ -4591,14 +4635,6 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
     auto *Ind = Induction.first;
     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
 
-    // We already considered pointer induction variables, so there's no reason
-    // to look at their users again.
-    //
-    // TODO: Once we are able to vectorize pointer induction variables we
-    //       should no longer skip over them here.
-    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
-      continue;
-
     // If tail-folding is applied, the primary induction variable will be used
     // to feed a vector compare.
     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
new file mode 100644
index 000000000000..daeac07f33e1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
@@ -0,0 +1,972 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -dce -instcombine --simplifycfg -enable-arm-maskedgatscat < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+define hidden void @pointer_phi_v4i32_add1(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %s, i32%y) {
+; CHECK-LABEL: @pointer_phi_v4i32_add1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+for.body:
+  %A.addr.09 = phi i32* [ %add.ptr, %for.body ], [ %A, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.07 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %0 = load i32, i32* %A.addr.09, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %A.addr.09, i32 1
+  %add = add nsw i32 %0, %y
+  store i32 %add, i32* %B.addr.07, align 4
+  %incdec.ptr = getelementptr inbounds i32, i32* %B.addr.07, i32 1
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+define hidden void @pointer_phi_v4i32_add2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) {
+; CHECK-LABEL: @pointer_phi_v4i32_add2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 1992
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 996
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP0]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !2
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_09:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_07:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A_ADDR_09]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_09]], i32 2
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], [[Y]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[B_ADDR_07]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_07]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !3
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+for.body:
+  %A.addr.09 = phi i32* [ %add.ptr, %for.body ], [ %A, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.07 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %0 = load i32, i32* %A.addr.09, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %A.addr.09, i32 2
+  %add = add nsw i32 %0, %y
+  store i32 %add, i32* %B.addr.07, align 4
+  %incdec.ptr = getelementptr inbounds i32, i32* %B.addr.07, i32 1
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+define hidden void @pointer_phi_v4i32_add3(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) {
+; CHECK-LABEL: @pointer_phi_v4i32_add3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 2988
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 996
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0]] = getelementptr i32, i32* [[POINTER_PHI]], i32 12
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_09:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_07:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A_ADDR_09]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_09]], i32 3
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP5]], [[Y]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[B_ADDR_07]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_07]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !6
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+for.body:
+  %A.addr.09 = phi i32* [ %add.ptr, %for.body ], [ %A, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.07 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %0 = load i32, i32* %A.addr.09, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %A.addr.09, i32 3
+  %add = add nsw i32 %0, %y
+  store i32 %add, i32* %B.addr.07, align 4
+  %incdec.ptr = getelementptr inbounds i32, i32* %B.addr.07, i32 1
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+define hidden void @pointer_phi_v8i16_add1(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i32 %y) {
+; CHECK-LABEL: @pointer_phi_v8i16_add1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP0]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP4]] to <8 x i16>*
+; CHECK-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* [[TMP3]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP4]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = trunc i32 %y to i16
+  br label %for.body
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %A.addr.011 = phi i16* [ %A, %entry ], [ %add.ptr, %for.body ]
+  %i.010 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.09 = phi i16* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %l1 = load i16, i16* %A.addr.011, align 2
+  %add.ptr = getelementptr inbounds i16, i16* %A.addr.011, i32 1
+  %conv1 = add i16 %l1, %0
+  store i16 %conv1, i16* %B.addr.09, align 2
+  %incdec.ptr = getelementptr inbounds i16, i16* %B.addr.09, i32 1
+  %inc = add nuw nsw i32 %i.010, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+define hidden void @pointer_phi_v8i16_add2(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i32 %y) {
+; CHECK-LABEL: @pointer_phi_v8i16_add2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1984
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 992
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP0]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[A]], i32 [[TMP1]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[B]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[NEXT_GEP]] to <16 x i16>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i16>, <16 x i16>* [[TMP2]], align 2
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[NEXT_GEP4]] to <8 x i16>*
+; CHECK-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* [[TMP4]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_011:%.*]] = phi i16* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_09:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[L1:%.*]] = load i16, i16* [[A_ADDR_011]], align 2
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i16, i16* [[A_ADDR_011]], i32 2
+; CHECK-NEXT:    [[CONV1:%.*]] = add i16 [[L1]], [[TMP0]]
+; CHECK-NEXT:    store i16 [[CONV1]], i16* [[B_ADDR_09]], align 2
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[B_ADDR_09]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_010]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !9
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = trunc i32 %y to i16
+  br label %for.body
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %A.addr.011 = phi i16* [ %A, %entry ], [ %add.ptr, %for.body ]
+  %i.010 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.09 = phi i16* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %l1 = load i16, i16* %A.addr.011, align 2
+  %add.ptr = getelementptr inbounds i16, i16* %A.addr.011, i32 2
+  %conv1 = add i16 %l1, %0
+  store i16 %conv1, i16* %B.addr.09, align 2
+  %incdec.ptr = getelementptr inbounds i16, i16* %B.addr.09, i32 1
+  %inc = add nuw nsw i32 %i.010, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+define hidden void @pointer_phi_v8i16_add3(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i32 %y) {
+; CHECK-LABEL: @pointer_phi_v8i16_add3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i16
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_011:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_010:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_09:%.*]] = phi i16* [ [[B:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[L1:%.*]] = load i16, i16* [[A_ADDR_011]], align 2
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i16, i16* [[A_ADDR_011]], i32 3
+; CHECK-NEXT:    [[CONV1:%.*]] = add i16 [[L1]], [[TMP0]]
+; CHECK-NEXT:    store i16 [[CONV1]], i16* [[B_ADDR_09]], align 2
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[B_ADDR_09]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_010]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = trunc i32 %y to i16
+  br label %for.body
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %A.addr.011 = phi i16* [ %A, %entry ], [ %add.ptr, %for.body ]
+  %i.010 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.09 = phi i16* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %l1 = load i16, i16* %A.addr.011, align 2
+  %add.ptr = getelementptr inbounds i16, i16* %A.addr.011, i32 3
+  %conv1 = add i16 %l1, %0
+  store i16 %conv1, i16* %B.addr.09, align 2
+  %incdec.ptr = getelementptr inbounds i16, i16* %B.addr.09, i32 1
+  %inc = add nuw nsw i32 %i.010, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+define hidden void @pointer_phi_v16i8_add1(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i32 %y) {
+; CHECK-LABEL: @pointer_phi_v16i8_add1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* [[A:%.*]], i32 992
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i8, i8* [[B:%.*]], i32 992
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[A]], i32 [[INDEX]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[B]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[NEXT_GEP4]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* [[TMP3]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_010:%.*]] = phi i8* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, i8* [[A_ADDR_010]], align 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_010]], i32 1
+; CHECK-NEXT:    [[CONV1:%.*]] = add i8 [[TMP5]], [[TMP0]]
+; CHECK-NEXT:    store i8 [[CONV1]], i8* [[B_ADDR_08]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[B_ADDR_08]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_09]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !11
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = trunc i32 %y to i8
+  br label %for.body
+
+for.body:
+  %A.addr.010 = phi i8* [ %A, %entry ], [ %add.ptr, %for.body ]
+  %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.08 = phi i8* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %1 = load i8, i8* %A.addr.010, align 1
+  %add.ptr = getelementptr inbounds i8, i8* %A.addr.010, i32 1
+  %conv1 = add i8 %1, %0
+  store i8 %conv1, i8* %B.addr.08, align 1
+  %incdec.ptr = getelementptr inbounds i8, i8* %B.addr.08, i32 1
+  %inc = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+; Scalar loop of 1000 iterations: loads an i8 through a pointer phi on %A that
+; advances by 2 elements per iteration, adds trunc(%y), and stores through a
+; pointer phi on %B that advances by 1 element. The expected output addresses
+; %A with a scalar GEP indexed by (index << 1), does one <32 x i8> wide load
+; whose even lanes are picked by a shufflevector, and stores <16 x i8>. The
+; vector loop stops at index 992 and the scalar loop runs from 992 to 1000.
+define hidden void @pointer_phi_v16i8_add2(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i32 %y) {
+; CHECK-LABEL: @pointer_phi_v16i8_add2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* [[A:%.*]], i32 1984
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i8, i8* [[B:%.*]], i32 992
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[A]], i32 [[TMP1]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[B]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[NEXT_GEP]] to <32 x i8>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i8> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[NEXT_GEP4]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_010:%.*]] = phi i8* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, i8* [[A_ADDR_010]], align 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_010]], i32 2
+; CHECK-NEXT:    [[CONV1:%.*]] = add i8 [[TMP6]], [[TMP0]]
+; CHECK-NEXT:    store i8 [[CONV1]], i8* [[B_ADDR_08]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[B_ADDR_08]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_09]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !13
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = trunc i32 %y to i8
+  br label %for.body
+
+for.body:
+  %A.addr.010 = phi i8* [ %A, %entry ], [ %add.ptr, %for.body ]
+  %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.08 = phi i8* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %1 = load i8, i8* %A.addr.010, align 1
+  %add.ptr = getelementptr inbounds i8, i8* %A.addr.010, i32 2
+  %conv1 = add i8 %1, %0
+  store i8 %conv1, i8* %B.addr.08, align 1
+  %incdec.ptr = getelementptr inbounds i8, i8* %B.addr.08, i32 1
+  %inc = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+; Same i8 loop shape as the add2 variant, but the pointer phi on %A advances by
+; 3 elements per iteration. The expected output contains only the original
+; entry / for.body / end blocks with no vector body, i.e. the loop is expected
+; to remain scalar.
+define hidden void @pointer_phi_v16i8_add3(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i32 %y) {
+; CHECK-LABEL: @pointer_phi_v16i8_add3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i8
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_010:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[I_09:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_08:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[A_ADDR_010]], align 1
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_010]], i32 3
+; CHECK-NEXT:    [[CONV1:%.*]] = add i8 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    store i8 [[CONV1]], i8* [[B_ADDR_08]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[B_ADDR_08]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_09]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = trunc i32 %y to i8
+  br label %for.body
+
+for.body:
+  %A.addr.010 = phi i8* [ %A, %entry ], [ %add.ptr, %for.body ]
+  %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.08 = phi i8* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %1 = load i8, i8* %A.addr.010, align 1
+  %add.ptr = getelementptr inbounds i8, i8* %A.addr.010, i32 3
+  %conv1 = add i8 %1, %0
+  store i8 %conv1, i8* %B.addr.08, align 1
+  %incdec.ptr = getelementptr inbounds i8, i8* %B.addr.08, i32 1
+  %inc = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+; Scalar loop of 1000 iterations over float: loads through a pointer phi on %A
+; and stores (value fadd-fast %y) through a pointer phi on %B, both advancing by
+; 1 element per iteration. The expected output uses contiguous <4 x float> wide
+; load/store from scalar GEPs on the index, steps the index by 4, and branches
+; straight to the end block at 1000 with no scalar loop listed.
+define hidden void @pointer_phi_v4f32_add1(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) {
+; CHECK-LABEL: @pointer_phi_v4f32_add1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr float, float* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr float, float* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[NEXT_GEP]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[NEXT_GEP4]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+for.body:
+  %A.addr.09 = phi float* [ %add.ptr, %for.body ], [ %A, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.07 = phi float* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %0 = load float, float* %A.addr.09, align 4
+  %add.ptr = getelementptr inbounds float, float* %A.addr.09, i32 1
+  %add = fadd fast float %0, %y
+  store float %add, float* %B.addr.07, align 4
+  %incdec.ptr = getelementptr inbounds float, float* %B.addr.07, i32 1
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+; Float loop of 1000 iterations where the pointer phi on %A advances by 2
+; elements per iteration and the one on %B by 1. The expected output addresses
+; %A with a scalar GEP indexed by (index << 1), does one <8 x float> wide load
+; whose even lanes are picked by a shufflevector, and stores <4 x float>. The
+; vector loop stops at index 996 and the scalar loop runs from 996 to 1000.
+define hidden void @pointer_phi_v4f32_add2(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) {
+; CHECK-LABEL: @pointer_phi_v4f32_add2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr float, float* [[A:%.*]], i32 1992
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr float, float* [[B:%.*]], i32 996
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr float, float* [[A]], i32 [[TMP0]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr float, float* [[B]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[NEXT_GEP]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[NEXT_GEP4]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !15
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_09:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_07:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[A_ADDR_09]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds float, float* [[A_ADDR_09]], i32 2
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP5]], [[Y]]
+; CHECK-NEXT:    store float [[ADD]], float* [[B_ADDR_07]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[B_ADDR_07]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !16
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+for.body:
+  %A.addr.09 = phi float* [ %add.ptr, %for.body ], [ %A, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.07 = phi float* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %0 = load float, float* %A.addr.09, align 4
+  %add.ptr = getelementptr inbounds float, float* %A.addr.09, i32 2
+  %add = fadd fast float %0, %y
+  store float %add, float* %B.addr.07, align 4
+  %incdec.ptr = getelementptr inbounds float, float* %B.addr.07, i32 1
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+; Float loop of 1000 iterations where the pointer phi on %A advances by 3
+; elements per iteration and the one on %B by 1. The expected output keeps a
+; pointer-typed phi for %A that is bumped by 12 elements per vector iteration,
+; derives the four lane addresses with a vector-index GEP <0, 3, 6, 9>, and
+; reads them with llvm.masked.gather; %B is written with a contiguous
+; <4 x float> store. The vector loop stops at index 996 and the scalar loop
+; runs from 996 to 1000.
+define hidden void @pointer_phi_v4f32_add3(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) {
+; CHECK-LABEL: @pointer_phi_v4f32_add3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr float, float* [[A:%.*]], i32 2988
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr float, float* [[B:%.*]], i32 996
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi float* [ [[A]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0]] = getelementptr float, float* [[POINTER_PHI]], i32 12
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, float* [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr float, float* [[B]], i32 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[NEXT_GEP]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !17
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_09:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_07:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[A_ADDR_09]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds float, float* [[A_ADDR_09]], i32 3
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP5]], [[Y]]
+; CHECK-NEXT:    store float [[ADD]], float* [[B_ADDR_07]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, float* [[B_ADDR_07]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !18
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+for.body:
+  %A.addr.09 = phi float* [ %add.ptr, %for.body ], [ %A, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.07 = phi float* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %0 = load float, float* %A.addr.09, align 4
+  %add.ptr = getelementptr inbounds float, float* %A.addr.09, i32 3
+  %add = fadd fast float %0, %y
+  store float %add, float* %B.addr.07, align 4
+  %incdec.ptr = getelementptr inbounds float, float* %B.addr.07, i32 1
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+; Half-precision loop of 1000 iterations: both pointer phis (%A and %B) advance
+; by 1 element per iteration and the stored value is (load fadd-fast %y). The
+; expected output uses contiguous <8 x half> wide load/store, steps the index
+; by 8, and branches straight to the end block at 1000.
+; NOTE(review): the function name says "v4half" while the expected vectors are
+; <8 x half>; the name is kept as-is here.
+define hidden void @pointer_phi_v4half_add1(half* noalias nocapture readonly %A, half* noalias nocapture %B, half %y) {
+; CHECK-LABEL: @pointer_phi_v4half_add1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x half> [[BROADCAST_SPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr half, half* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr half, half* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[NEXT_GEP]] to <8 x half>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x half>, <8 x half>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <8 x half> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast half* [[NEXT_GEP4]] to <8 x half>*
+; CHECK-NEXT:    store <8 x half> [[TMP1]], <8 x half>* [[TMP2]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+for.body:
+  %A.addr.09 = phi half* [ %add.ptr, %for.body ], [ %A, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.07 = phi half* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %0 = load half, half* %A.addr.09, align 4
+  %add.ptr = getelementptr inbounds half, half* %A.addr.09, i32 1
+  %add = fadd fast half %0, %y
+  store half %add, half* %B.addr.07, align 4
+  %incdec.ptr = getelementptr inbounds half, half* %B.addr.07, i32 1
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+; Half-precision loop of 1000 iterations where the pointer phi on %A advances
+; by 2 elements per iteration and the one on %B by 1. The expected output
+; addresses %A with a scalar GEP indexed by (index << 1), does one <16 x half>
+; wide load whose even lanes are picked by a shufflevector, and stores
+; <8 x half>. The vector loop stops at index 992 and the scalar loop runs from
+; 992 to 1000.
+define hidden void @pointer_phi_v4half_add2(half* noalias nocapture readonly %A, half* noalias nocapture %B, half %y) {
+; CHECK-LABEL: @pointer_phi_v4half_add2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr half, half* [[A:%.*]], i32 1984
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr half, half* [[B:%.*]], i32 992
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x half> [[BROADCAST_SPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i32 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr half, half* [[A]], i32 [[TMP0]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr half, half* [[B]], i32 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half* [[NEXT_GEP]] to <16 x half>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x half>, <16 x half>* [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x half> [[WIDE_VEC]], <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast half* [[NEXT_GEP4]] to <8 x half>*
+; CHECK-NEXT:    store <8 x half> [[TMP2]], <8 x half>* [[TMP3]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !20
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_09:%.*]] = phi half* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_07:%.*]] = phi half* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = load half, half* [[A_ADDR_09]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds half, half* [[A_ADDR_09]], i32 2
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast half [[TMP5]], [[Y]]
+; CHECK-NEXT:    store half [[ADD]], half* [[B_ADDR_07]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds half, half* [[B_ADDR_07]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !21
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+for.body:
+  %A.addr.09 = phi half* [ %add.ptr, %for.body ], [ %A, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.07 = phi half* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %0 = load half, half* %A.addr.09, align 4
+  %add.ptr = getelementptr inbounds half, half* %A.addr.09, i32 2
+  %add = fadd fast half %0, %y
+  store half %add, half* %B.addr.07, align 4
+  %incdec.ptr = getelementptr inbounds half, half* %B.addr.07, i32 1
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+; Half-precision loop of 1000 iterations where the pointer phi on %A advances
+; by 3 elements per iteration and the one on %B by 1. The expected output
+; contains only the original entry / for.body / end blocks with no vector body,
+; i.e. the loop is expected to remain scalar.
+define hidden void @pointer_phi_v4half_add3(half* noalias nocapture readonly %A, half* noalias nocapture %B, half %y) {
+; CHECK-LABEL: @pointer_phi_v4half_add3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_09:%.*]] = phi half* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[B_ADDR_07:%.*]] = phi half* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load half, half* [[A_ADDR_09]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds half, half* [[A_ADDR_09]], i32 3
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast half [[TMP0]], [[Y:%.*]]
+; CHECK-NEXT:    store half [[ADD]], half* [[B_ADDR_07]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds half, half* [[B_ADDR_07]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+for.body:
+  %A.addr.09 = phi half* [ %add.ptr, %for.body ], [ %A, %entry ]
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %B.addr.07 = phi half* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+  %0 = load half, half* %A.addr.09, align 4
+  %add.ptr = getelementptr inbounds half, half* %A.addr.09, i32 3
+  %add = fadd fast half %0, %y
+  store half %add, half* %B.addr.07, align 4
+  %incdec.ptr = getelementptr inbounds half, half* %B.addr.07, i32 1
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+end:
+  ret void
+}
+
+; Loop metadata carrying llvm.loop.interleave.count = 2; it is attached to the
+; back-edge branch of the loop in @pointer_phi_v4i32_uf2 through !llvm.loop !0.
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.interleave.count", i32 2}
+
+; i32 loop of 10000 iterations tagged with loop metadata !0 (interleave count
+; 2): the pointer phi on %A advances by 6 elements per iteration and the one on
+; %B by 1; the stored value is (load + %y). %n is not referenced by the body.
+; The expected output keeps one pointer-typed phi for %A bumped by 48 elements
+; per vector iteration and derives two vectors of lane addresses from it
+; (<0, 6, 12, 18> and <24, 30, 36, 42>), each read with llvm.masked.gather;
+; the two results are stored contiguously to %B at offsets 0 and 4 from the
+; index-based GEP. The index steps by 8, the vector loop stops at 9992, and the
+; scalar loop runs from 9992 to 10000.
+define hidden void @pointer_phi_v4i32_uf2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %n, i32 %y) {
+; CHECK-LABEL: @pointer_phi_v4i32_uf2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 59952
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 9992
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0]] = getelementptr i32, i32* [[POINTER_PHI]], i32 48
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 0, i32 6, i32 12, i32 18>
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 24, i32 30, i32 36, i32 42>
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9992
+; CHECK-NEXT:    br i1 [[TMP8]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !22
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_08:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9992, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_06:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[A_ADDR_08]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_08]], i32 6
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP9]], [[Y]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[B_ADDR_06]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_06]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop !23
+;
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %A.addr.08 = phi i32* [ %A, %entry ], [ %add.ptr, %for.body ]
+  %i.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.06 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %0 = load i32, i32* %A.addr.08, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %A.addr.08, i32 6
+  %add = add nsw i32 %0, %y
+  store i32 %add, i32* %B.addr.06, align 4
+  %incdec.ptr = getelementptr inbounds i32, i32* %B.addr.06, i32 1
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+}
+
+; Loop metadata carrying llvm.loop.interleave.count = 4; it is attached to the
+; back-edge branch of the loop in @pointer_phi_v4i32_uf4 through !llvm.loop !2.
+!2 = distinct !{!2, !3}
+!3 = !{!"llvm.loop.interleave.count", i32 4}
+
+; i32 loop of 10000 iterations tagged with loop metadata !2 (interleave count
+; 4): the pointer phi on %A advances by 6 elements per iteration and the one on
+; %B by 1; the stored value is (load + %y). %n is not referenced by the body.
+; The expected output keeps one pointer-typed phi for %A bumped by 96 elements
+; per vector iteration and derives four vectors of lane addresses from it
+; (element offsets 0..18, 24..42, 48..66 and 72..90 in steps of 6), each read
+; with llvm.masked.gather; the four results are stored contiguously to %B at
+; offsets 0, 4, 8 and 12 from the index-based GEP. The index steps by 16, the
+; vector loop stops at 9984, and the scalar loop runs from 9984 to 10000.
+define hidden void @pointer_phi_v4i32_uf4(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %n, i32 %y) {
+; CHECK-LABEL: @pointer_phi_v4i32_uf4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 59904
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 9984
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT12]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT14]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0]] = getelementptr i32, i32* [[POINTER_PHI]], i32 96
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 0, i32 6, i32 12, i32 18>
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 24, i32 30, i32 36, i32 42>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 48, i32 54, i32 60, i32 66>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 72, i32 78, i32 84, i32 90>
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT11]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT15]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 4
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 8
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 12
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP15]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9984
+; CHECK-NEXT:    br i1 [[TMP16]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !24
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[A_ADDR_08:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9984, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[B_ADDR_06:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, i32* [[A_ADDR_08]], align 4
+; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_08]], i32 6
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP17]], [[Y]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[B_ADDR_06]], align 4
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_06]], i32 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop !25
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %A.addr.08 = phi i32* [ %A, %entry ], [ %add.ptr, %for.body ]
+  %i.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %B.addr.06 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+  %0 = load i32, i32* %A.addr.08, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %A.addr.08, i32 6
+  %add = add nsw i32 %0, %y
+  store i32 %add, i32* %B.addr.06, align 4
+  %incdec.ptr = getelementptr inbounds i32, i32* %B.addr.06, i32 1
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !2
+}
+
+; mult_ptr_iv: a 1000-iteration loop driven by two i8* pointer inductions that
+; each advance 3 bytes per iteration. Every iteration loads x[0..2], forms
+; x[0]*10, x[0]*x[1] and x[0]*x[2] (widened to i32, truncated back to i8) and
+; stores the three products to z[0..2].
+; The CHECK lines expect a runtime overlap check on the two 3000-byte ranges,
+; then a vector body with one pointer phi per induction, stepped by 12 bytes
+; and expanded to four lanes with offsets <0, 3, 6, 9> feeding masked
+; gathers/scatters; for.body remains as the scalar fallback loop.
+define hidden void @mult_ptr_iv(i8* noalias nocapture readonly %x, i8* noalias nocapture %z) {
+; CHECK-LABEL: @mult_ptr_iv(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, i8* [[Z:%.*]], i32 3000
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[X:%.*]], i32 3000
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[SCEVGEP1]], [[Z]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i8* [[SCEVGEP]], [[X]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* [[X]], i32 3000
+; CHECK-NEXT:    [[IND_END3:%.*]] = getelementptr i8, i8* [[Z]], i32 3000
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi i8* [ [[X]], [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI5:%.*]] = phi i8* [ [[Z]], [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0]] = getelementptr i8, i8* [[POINTER_PHI]], i32 12
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[TMP2]] = getelementptr i8, i8* [[POINTER_PHI5]], i32 12
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[POINTER_PHI5]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP1]], i32 1
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP1]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef), !alias.scope !26
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP1]], i32 2
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP4]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef), !alias.scope !26
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP5]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef), !alias.scope !26
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], <i8 10, i8 10, i8 10, i8 10>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP3]], i32 1
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP6]], <4 x i8*> [[TMP3]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope !29, !noalias !26
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP3]], i32 2
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP7]], <4 x i8*> [[TMP9]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope !29, !noalias !26
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP8]], <4 x i8*> [[TMP10]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope !29, !noalias !26
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP11]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !31
+; CHECK:       for.body:
+; CHECK-NEXT:    [[X_ADDR_050:%.*]] = phi i8* [ [[INCDEC_PTR2:%.*]], [[FOR_BODY]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[Z_ADDR_049:%.*]] = phi i8* [ [[INCDEC_PTR34:%.*]], [[FOR_BODY]] ], [ [[Z]], [[ENTRY]] ]
+; CHECK-NEXT:    [[I_048:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, i8* [[X_ADDR_050]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, i8* [[X_ADDR_050]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i8, i8* [[X_ADDR_050]], i32 2
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR2]] = getelementptr inbounds i8, i8* [[X_ADDR_050]], i32 3
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[INCDEC_PTR1]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = mul i8 [[TMP12]], 10
+; CHECK-NEXT:    [[MUL1:%.*]] = mul i8 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[MUL2:%.*]] = mul i8 [[TMP12]], [[TMP14]]
+; CHECK-NEXT:    [[INCDEC_PTR32:%.*]] = getelementptr inbounds i8, i8* [[Z_ADDR_049]], i32 1
+; CHECK-NEXT:    store i8 [[MUL]], i8* [[Z_ADDR_049]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR33:%.*]] = getelementptr inbounds i8, i8* [[Z_ADDR_049]], i32 2
+; CHECK-NEXT:    store i8 [[MUL1]], i8* [[INCDEC_PTR32]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR34]] = getelementptr inbounds i8, i8* [[Z_ADDR_049]], i32 3
+; CHECK-NEXT:    store i8 [[MUL2]], i8* [[INCDEC_PTR33]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw i32 [[I_048]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[END]], label [[FOR_BODY]], !llvm.loop !32
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  ; Pointer inductions over %x and %z plus the i32 trip counter.
+  %x.addr.050 = phi i8* [ %incdec.ptr2, %for.body ], [ %x, %entry ]
+  %z.addr.049 = phi i8* [ %incdec.ptr34, %for.body ], [ %z, %entry ]
+  %i.048 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  ; Load three adjacent bytes; %incdec.ptr2 (x + 3) is the next value of the
+  ; x induction.
+  %incdec.ptr = getelementptr inbounds i8, i8* %x.addr.050, i32 1
+  %0 = load i8, i8* %x.addr.050, align 1
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %x.addr.050, i32 2
+  %1 = load i8, i8* %incdec.ptr, align 1
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %x.addr.050, i32 3
+  %2 = load i8, i8* %incdec.ptr1, align 1
+  ; Multiply in i32, then truncate each product back to i8.
+  %conv = zext i8 %0 to i32
+  %mul = mul nuw nsw i32 %conv, 10
+  %conv1 = zext i8 %1 to i32
+  %conv2 = zext i8 %2 to i32
+  %mul1 = mul nuw nsw i32 %conv, %conv1
+  %mul2 = mul nuw nsw i32 %conv, %conv2
+  %conv3 = trunc i32 %mul to i8
+  %conv4 = trunc i32 %mul1 to i8
+  %conv5 = trunc i32 %mul2 to i8
+  ; Store three adjacent bytes; %incdec.ptr34 (z + 3) is the next value of the
+  ; z induction.
+  %incdec.ptr32 = getelementptr inbounds i8, i8* %z.addr.049, i32 1
+  store i8 %conv3, i8* %z.addr.049, align 1
+  %incdec.ptr33 = getelementptr inbounds i8, i8* %z.addr.049, i32 2
+  store i8 %conv4, i8* %incdec.ptr32, align 1
+  %incdec.ptr34 = getelementptr inbounds i8, i8* %z.addr.049, i32 3
+  store i8 %conv5, i8* %incdec.ptr33, align 1
+  ; Exit once the counter reaches 1000.
+  %inc = add nuw i32 %i.048, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %end, label %for.body
+
+end:
+  ret void
+}