[llvm] r277474 - [LV] Generate both scalar and vector integer induction variables

Matthew Simpson via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 2 08:25:16 PDT 2016


Author: mssimpso
Date: Tue Aug  2 10:25:16 2016
New Revision: 277474

URL: http://llvm.org/viewvc/llvm-project?rev=277474&view=rev
Log:
[LV] Generate both scalar and vector integer induction variables

This patch enables the vectorizer to generate both scalar and vector versions
of an integer induction variable for a given loop. Previously, we only
generated a scalar induction variable if we knew all its users were going to be
scalar. Otherwise, we generated a vector induction variable. In the case of a
loop with both scalar and vector users of the induction variable, we would
generate the vector induction variable and extract scalar values from it for
the scalar users. With this patch, we now generate both versions of the
induction variable when there are both scalar and vector users and select which
version to use based on whether the user is scalar or vector.

Differential Revision: https://reviews.llvm.org/D22869

Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll
    llvm/trunk/test/Transforms/LoopVectorize/induction.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=277474&r1=277473&r2=277474&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Tue Aug  2 10:25:16 2016
@@ -435,6 +435,9 @@ protected:
   void widenIntInduction(PHINode *IV, VectorParts &Entry,
                          TruncInst *Trunc = nullptr);
 
+  /// Returns true if we should generate a scalar version of \p IV.
+  bool needsScalarInduction(Instruction *IV) const;
+
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
   /// When we widen (vectorize) values we place them in the map. If the values
@@ -1970,6 +1973,16 @@ void InnerLoopVectorizer::createVectorIn
   VecInd->addIncoming(LastInduction, LoopVectorLatch);
 }
 
+bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
+  if (Legal->isScalarAfterVectorization(IV))
+    return true;
+  auto isScalarInst = [&](User *U) -> bool {
+    auto *I = cast<Instruction>(U);
+    return (OrigLoop->contains(I) && Legal->isScalarAfterVectorization(I));
+  };
+  return any_of(IV->users(), isScalarInst);
+}
+
 void InnerLoopVectorizer::widenIntInduction(PHINode *IV, VectorParts &Entry,
                                             TruncInst *Trunc) {
 
@@ -1982,9 +1995,25 @@ void InnerLoopVectorizer::widenIntInduct
   // If a truncate instruction was provided, get the smaller type.
   auto *TruncType = Trunc ? cast<IntegerType>(Trunc->getType()) : nullptr;
 
+  // The scalar value to broadcast. This will be derived from the canonical
+  // induction variable.
+  Value *ScalarIV = nullptr;
+
   // The step of the induction.
   Value *Step = nullptr;
 
+  // The value from the original loop to which we are mapping the new induction
+  // variable.
+  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+
+  // True if we have vectorized the induction variable.
+  auto VectorizedIV = false;
+
+  // Determine if we want a scalar version of the induction variable. This is
+  // true if the induction variable itself is not widened, or if it has at
+  // least one user in the loop that is not widened.
+  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
+
   // If the induction variable has a constant integer step value, go ahead and
   // get it now.
   if (ID.getConstIntStepValue())
@@ -1994,40 +2023,45 @@ void InnerLoopVectorizer::widenIntInduct
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
   if (VF > 1 && IV->getType() == Induction->getType() && Step &&
-      !Legal->isScalarAfterVectorization(IV))
-    return createVectorIntInductionPHI(ID, Entry, TruncType);
-
-  // The scalar value to broadcast. This will be derived from the canonical
-  // induction variable.
-  Value *ScalarIV = nullptr;
-
-  // Define the scalar induction variable and step values. If we were given a
-  // truncation type, truncate the canonical induction variable and constant
-  // step. Otherwise, derive these values from the induction descriptor.
-  if (TruncType) {
-    assert(Step && "Truncation requires constant integer step");
-    auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
-    ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
-    Step = ConstantInt::getSigned(TruncType, StepInt);
-  } else {
-    ScalarIV = Induction;
-    auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
-    if (IV != OldInduction) {
-      ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
-      ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
-      ScalarIV->setName("offset.idx");
-    }
-    if (!Step) {
-      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
-      Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
-                               &*Builder.GetInsertPoint());
+      !Legal->isScalarAfterVectorization(EntryVal)) {
+    createVectorIntInductionPHI(ID, Entry, TruncType);
+    VectorizedIV = true;
+  }
+
+  // If we haven't yet vectorized the induction variable, or if we will create
+  // a scalar one, we need to define the scalar induction variable and step
+  // values. If we were given a truncation type, truncate the canonical
+  // induction variable and constant step. Otherwise, derive these values from
+  // the induction descriptor.
+  if (!VectorizedIV || NeedsScalarIV) {
+    if (TruncType) {
+      assert(Step && "Truncation requires constant integer step");
+      auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
+      ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
+      Step = ConstantInt::getSigned(TruncType, StepInt);
+    } else {
+      ScalarIV = Induction;
+      auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+      if (IV != OldInduction) {
+        ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
+        ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
+        ScalarIV->setName("offset.idx");
+      }
+      if (!Step) {
+        SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+        Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+                                 &*Builder.GetInsertPoint());
+      }
     }
   }
 
-  // Splat the scalar induction variable, and build the necessary step vectors.
-  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
-  for (unsigned Part = 0; Part < UF; ++Part)
-    Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+  // If we haven't yet vectorized the induction variable, splat the scalar
+  // induction variable, and build the necessary step vectors.
+  if (!VectorizedIV) {
+    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
+    for (unsigned Part = 0; Part < UF; ++Part)
+      Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+  }
 
   // If an induction variable is only used for counting loop iterations or
   // calculating addresses, it doesn't need to be widened. Create scalar steps
@@ -2035,10 +2069,8 @@ void InnerLoopVectorizer::widenIntInduct
   // addition of the scalar steps will not increase the number of instructions
   // in the loop in the common case prior to InstCombine. We will be trading
   // one vector extract for each scalar step.
-  if (VF > 1 && Legal->isScalarAfterVectorization(IV)) {
-    auto *EntryVal = Trunc ? cast<Value>(Trunc) : IV;
+  if (NeedsScalarIV)
     buildScalarSteps(ScalarIV, Step, EntryVal);
-  }
 }
 
 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,

Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll?rev=277474&r1=277473&r2=277474&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/scatter_crash.ll Tue Aug  2 10:25:16 2016
@@ -19,54 +19,56 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX:%.*]].next, %vector.body ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ 
 ; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <16 x i64> [ 
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 %index, 1
+; CHECK-NEXT:    %offset.idx = add i64 [[SHL]], 8
+; CHECK-NEXT:    [[IND00:%.*]] = add i64 %offset.idx, 0
+; CHECK-NEXT:    [[IND02:%.*]] = add i64 %offset.idx, 2
+; CHECK-NEXT:    [[IND04:%.*]] = add i64 %offset.idx, 4
+; CHECK-NEXT:    [[IND06:%.*]] = add i64 %offset.idx, 6
+; CHECK-NEXT:    [[IND08:%.*]] = add i64 %offset.idx, 8
+; CHECK-NEXT:    [[IND10:%.*]] = add i64 %offset.idx, 10
+; CHECK-NEXT:    [[IND12:%.*]] = add i64 %offset.idx, 12
+; CHECK-NEXT:    [[IND14:%.*]] = add i64 %offset.idx, 14
+; CHECK-NEXT:    [[IND16:%.*]] = add i64 %offset.idx, 16
+; CHECK-NEXT:    [[IND18:%.*]] = add i64 %offset.idx, 18
+; CHECK-NEXT:    [[IND20:%.*]] = add i64 %offset.idx, 20
+; CHECK-NEXT:    [[IND22:%.*]] = add i64 %offset.idx, 22
+; CHECK-NEXT:    [[IND24:%.*]] = add i64 %offset.idx, 24
+; CHECK-NEXT:    [[IND26:%.*]] = add i64 %offset.idx, 26
+; CHECK-NEXT:    [[IND28:%.*]] = add i64 %offset.idx, 28
+; CHECK-NEXT:    [[IND30:%.*]] = add i64 %offset.idx, 30
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND00]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND02]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 2
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND04]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 3
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND06]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 4
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND08]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 5
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP26]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND10]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 6
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND12]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 7
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP32]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND14]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 8
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND16]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 9
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP38]]
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND18]]
 ; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 10
-; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP41]]
+; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND20]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 11
-; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP44]]
+; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND22]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 12
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP47]]
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND24]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12
-; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 13
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP50]]
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND26]]
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 14
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP53]]
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND28]]
 ; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14
-; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <16 x i64> [[VEC_IND]], i32 15
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[TMP56]]
+; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND30]]
 ; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15
 ; CHECK-NEXT:    [[TMP59:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
 ; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <16 x [10 x i32]*> [[TMP58]], i32 0

Modified: llvm/trunk/test/Transforms/LoopVectorize/induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/induction.ll?rev=277474&r1=277473&r2=277474&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/induction.ll Tue Aug  2 10:25:16 2016
@@ -220,23 +220,23 @@ for.end:
 ; INTERLEAVE:   %[[i5:.+]] = or i64 %[[i0]], 5
 ; INTERLEAVE:   %[[i6:.+]] = or i64 %[[i0]], 6
 ; INTERLEAVE:   %[[i7:.+]] = or i64 %[[i0]], 7
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i0]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i1]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i2]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i3]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i4]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i5]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i6]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i7]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i0]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i1]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i2]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i3]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i4]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i5]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i6]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i7]], i32 1
 
-%pair = type { i32, i32 }
-define void @scalarize_induction_variable_03(%pair *%p, i32 %y, i64 %n) {
+%pair.i32 = type { i32, i32 }
+define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
 entry:
   br label %for.body
 
 for.body:
   %i  = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-  %f = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %f = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
   %0 = load i32, i32* %f, align 8
   %1 = xor i32 %0, %y
   store i32 %1, i32* %f, align 8
@@ -264,16 +264,16 @@ for.end:
 ; INTERLEAVE:   %[[i5:.+]] = or i64 %[[i0]], 5
 ; INTERLEAVE:   %[[i6:.+]] = or i64 %[[i0]], 6
 ; INTERLEAVE:   %[[i7:.+]] = or i64 %[[i0]], 7
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i0]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i1]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i2]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i3]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i4]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i5]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i6]], i32 1
-; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i7]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i0]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i1]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i2]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i3]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i4]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i5]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i6]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i7]], i32 1
 
-define void @scalarize_induction_variable_04(i32* %a, %pair* %p, i32 %n) {
+define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
 entry:
   br label %for.body
 
@@ -282,7 +282,7 @@ for.body:
   %0 = shl nsw i64 %i, 2
   %1 = getelementptr inbounds i32, i32* %a, i64 %0
   %2 = load i32, i32* %1, align 1
-  %3 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %3 = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
   store i32 %2, i32* %3, align 1
   %i.next = add nuw nsw i64 %i, 1
   %4 = trunc i64 %i.next to i32
@@ -293,6 +293,70 @@ for.end:
   ret void
 }
 
+; Ensure we generate both a vector and a scalar induction variable. In this
+; test, the induction variable is used by an instruction that will be
+; vectorized (trunc) as well as an instruction that will remain in scalar form
+; (getelementptr).
+;
+; CHECK-LABEL: @iv_vector_and_scalar_users(
+; CHECK: vector.body:
+; CHECK:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:   %vec.ind = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:   %vec.ind1 = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next2, %vector.body ]
+; CHECK:   %[[i0:.+]] = add i64 %index, 0
+; CHECK:   %[[i1:.+]] = add i64 %index, 1
+; CHECK:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i0]], i32 1
+; CHECK:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i1]], i32 1
+; CHECK:   %index.next = add i64 %index, 2
+; CHECK:   %vec.ind.next = add <2 x i64> %vec.ind, <i64 2, i64 2>
+; CHECK:   %vec.ind.next2 = add <2 x i32> %vec.ind1, <i32 2, i32 2>
+;
+; IND-LABEL: @iv_vector_and_scalar_users(
+; IND: vector.body:
+; IND:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND:   %vec.ind1 = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next2, %vector.body ]
+; IND:   %[[i1:.+]] = or i64 %index, 1
+; IND:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %index, i32 1
+; IND:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i1]], i32 1
+; IND:   %index.next = add i64 %index, 2
+; IND:   %vec.ind.next2 = add <2 x i32> %vec.ind1, <i32 2, i32 2>
+;
+; UNROLL-LABEL: @iv_vector_and_scalar_users(
+; UNROLL: vector.body:
+; UNROLL:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL:   %vec.ind2 = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next5, %vector.body ]
+; UNROLL:   %[[i1:.+]] = or i64 %index, 1
+; UNROLL:   %[[i2:.+]] = or i64 %index, 2
+; UNROLL:   %[[i3:.+]] = or i64 %index, 3
+; UNROLL:   %step.add3 = add <2 x i32> %vec.ind2, <i32 2, i32 2>
+; UNROLL:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %index, i32 1
+; UNROLL:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i1]], i32 1
+; UNROLL:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i2]], i32 1
+; UNROLL:   getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i3]], i32 1
+; UNROLL:   %index.next = add i64 %index, 4
+; UNROLL:   %vec.ind.next5 = add <2 x i32> %vec.ind2, <i32 4, i32 4>
+
+%pair.i16 = type { i16, i16 }
+define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %0 = trunc i64 %i to i32
+  %1 = add i32 %a, %0
+  %2 = trunc i32 %1 to i16
+  %3 = getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %i, i32 1
+  store i16 %2, i16* %3, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %4 = trunc i64 %i.next to i32
+  %cond = icmp eq i32 %4, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; Make sure that the loop exit count computation does not overflow for i8 and
 ; i16. The exit count of these loops is i8/i16 max + 1. If we don't cast the
 ; induction variable to a bigger type the exit count computation will overflow
@@ -537,31 +601,66 @@ exit:
   ret void
 }
 
-; IND-LABEL: nonprimary
-; IND-LABEL: vector.ph
-; IND: %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
-; IND: %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
-; IND: %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 42>
-; IND-LABEL: vector.body:
-; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; IND: %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; IND: %index.next = add i32 %index, 2
-; IND: %vec.ind.next = add <2 x i32> %vec.ind, <i32 84, i32 84>
-; IND: %[[CMP:.*]] = icmp eq i32 %index.next
-; IND: br i1 %[[CMP]]
-; UNROLL-LABEL: nonprimary
-; UNROLL-LABEL: vector.ph
-; UNROLL: %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
-; UNROLL: %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
-; UNROLL: %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 42>
-; UNROLL-LABEL: vector.body:
-; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; UNROLL: %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
-; UNROLL: %step.add = add <2 x i32> %vec.ind, <i32 84, i32 84>
-; UNROLL: %index.next = add i32 %index, 4
-; UNROLL: %vec.ind.next = add <2 x i32> %vec.ind, <i32 168, i32 168>
-; UNROLL: %[[CMP:.*]] = icmp eq i32 %index.next
-; UNROLL: br i1 %[[CMP]]
+; CHECK-LABEL: @nonprimary(
+; CHECK: vector.ph:
+; CHECK:   %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
+; CHECK:   %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK:   %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 1>
+; CHECK: vector.body:
+; CHECK:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:   %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:   %offset.idx = add i32 %i, %index
+; CHECK:   %[[A1:.*]] = add i32 %offset.idx, 0
+; CHECK:   %[[A2:.*]] = add i32 %offset.idx, 1
+; CHECK:   %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A1]]
+; CHECK:   %[[G2:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A2]]
+; CHECK:   %[[G3:.*]] = getelementptr i32, i32* %[[G1]], i32 0
+; CHECK:   %[[B1:.*]] = bitcast i32* %[[G3]] to <2 x i32>*
+; CHECK:   store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
+; CHECK:   %index.next = add i32 %index, 2
+; CHECK:   %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; CHECK:   %[[CMP:.*]] = icmp eq i32 %index.next, %n.vec
+; CHECK:   br i1 %[[CMP]]
+;
+; IND-LABEL: @nonprimary(
+; IND: vector.ph:
+; IND:   %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
+; IND:   %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; IND:   %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 1>
+; IND: vector.body:
+; IND:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND:   %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; IND:   %[[A1:.*]] = add i32 %index, %i
+; IND:   %[[S1:.*]] = sext i32 %[[A1]] to i64
+; IND:   %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i64 %[[S1]]
+; IND:   %[[B1:.*]] = bitcast i32* %[[G1]] to <2 x i32>*
+; IND:   store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
+; IND:   %index.next = add i32 %index, 2
+; IND:   %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; IND:   %[[CMP:.*]] = icmp eq i32 %index.next, %n.vec
+; IND:   br i1 %[[CMP]]
+;
+; UNROLL-LABEL: @nonprimary(
+; UNROLL: vector.ph:
+; UNROLL:   %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
+; UNROLL:   %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; UNROLL:   %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 1>
+; UNROLL: vector.body:
+; UNROLL:   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL:   %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; UNROLL:   %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; UNROLL:   %[[A1:.*]] = add i32 %index, %i
+; UNROLL:   %[[S1:.*]] = sext i32 %[[A1]] to i64
+; UNROLL:   %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i64 %[[S1]]
+; UNROLL:   %[[B1:.*]] = bitcast i32* %[[G1]] to <2 x i32>*
+; UNROLL:   store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
+; UNROLL:   %[[G2:.*]] = getelementptr i32, i32* %[[G1]], i64 2
+; UNROLL:   %[[B2:.*]] = bitcast i32* %[[G2]] to <2 x i32>*
+; UNROLL:   store <2 x i32> %step.add, <2 x i32>* %[[B2]]
+; UNROLL:   %index.next = add i32 %index, 4
+; UNROLL:   %vec.ind.next = add <2 x i32> %vec.ind, <i32 4, i32 4>
+; UNROLL:   %[[CMP:.*]] = icmp eq i32 %index.next, %n.vec
+; UNROLL:   br i1 %[[CMP]]
 define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
 for.body.preheader:
   br label %for.body
@@ -570,7 +669,7 @@ for.body:
   %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ %i, %for.body.preheader ]
   %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
   store i32 %indvars.iv, i32* %arrayidx, align 4
-  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 42
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
   %exitcond = icmp eq i32 %indvars.iv.next, %k
   br i1 %exitcond, label %exit, label %for.body
 




More information about the llvm-commits mailing list