[llvm] r274627 - [LV] Don't widen trivial induction variables

Wed Jul 6 07:27:00 PDT 2016

Author: mssimpso
Date: Wed Jul  6 09:26:59 2016
New Revision: 274627

URL: http://llvm.org/viewvc/llvm-project?rev=274627&view=rev
Log:
[LV] Don't widen trivial induction variables

We currently always vectorize induction variables. However, if an induction
variable is only used for counting loop iterations or computing addresses with
getelementptr instructions, we don't need to do this. Vectorizing these trivial
induction variables can create vector code that is difficult to simplify later
on. This is especially true when the unroll factor is greater than one, and we
create vector arithmetic when computing step vectors. With this patch, we check
if an induction variable is only used for counting iterations or computing
addresses, and if so, scalarize the arithmetic when computing step vectors
instead. This allows for greater simplification.

This patch addresses the suboptimal pointer arithmetic sequence seen in
PR27881.

Reference: https://llvm.org/bugs/show_bug.cgi?id=27881
Differential Revision: http://reviews.llvm.org/D21620

Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll
    llvm/trunk/test/Transforms/LoopVectorize/induction.ll
    llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll
    llvm/trunk/test/Transforms/LoopVectorize/reverse_induction.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=274627&r1=274626&r2=274627&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Wed Jul  6 09:26:59 2016
@@ -308,10 +308,14 @@ public:
   // Perform the actual loop widening (vectorization).
   // MinimumBitWidths maps scalar integer values to the smallest bitwidth they
   // can be validly truncated to. The cost model has assumed this truncation
-  // will happen when vectorizing.
+  // will happen when vectorizing. VecValuesToIgnore contains scalar values
+  // that the cost model has chosen to ignore because they will not be
+  // vectorized.
   void vectorize(LoopVectorizationLegality *L,
-                 const MapVector<Instruction *, uint64_t> &MinimumBitWidths) {
+                 const MapVector<Instruction *, uint64_t> &MinimumBitWidths,
+                 SmallPtrSetImpl<const Value *> &VecValuesToIgnore) {
     MinBWs = &MinimumBitWidths;
+    ValuesNotWidened = &VecValuesToIgnore;
     Legal = L;
     // Create a new empty loop. Unlink the old loop and connect the new one.
     createEmptyLoop();
@@ -407,6 +411,13 @@ protected:
   /// to each vector element of Val. The sequence starts at StartIndex.
   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
 
+  /// Compute a step vector like the above function, but scalarize the
+  /// arithmetic instead. The results of the computation are inserted into a
+  /// new vector with VF elements. \p Val is the initial value, \p Step is the
+  /// size of the step, and \p StartIdx indicates the index of the increment
+  /// from which to start computing the steps.
+  Value *getScalarizedStepVector(Value *Val, int StartIdx, Value *Step);
+
   /// Create a vector induction phi node based on an existing scalar one. This
   /// currently only works for integer induction variables with a constant
   /// step. If \p TruncType is non-null, instead of widening the original IV,
@@ -582,6 +593,11 @@ protected:
   /// represented as. The vector equivalents of these values should be truncated
   /// to this type.
   const MapVector<Instruction *, uint64_t> *MinBWs;
+
+  /// A set of values that should not be widened. This is taken from
+  /// VecValuesToIgnore in the cost model.
+  SmallPtrSetImpl<const Value *> *ValuesNotWidened;
+
   LoopVectorizationLegality *Legal;
 
   // Record whether runtime checks are added.
@@ -2073,7 +2089,7 @@ struct LoopVectorize : public FunctionPa
       // If we decided that it is not legal to vectorize the loop, then
       // interleave it.
       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, IC);
-      Unroller.vectorize(&LVL, CM.MinBWs);
+      Unroller.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore);
 
       emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
                              Twine("interleaved loop (interleaved count: ") +
@@ -2081,7 +2097,7 @@ struct LoopVectorize : public FunctionPa
     } else {
       // If we decided that it is *legal* to vectorize the loop, then do it.
       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, VF.Width, IC);
-      LB.vectorize(&LVL, CM.MinBWs);
+      LB.vectorize(&LVL, CM.MinBWs, CM.VecValuesToIgnore);
       ++LoopsVectorized;
 
       // Add metadata to disable runtime unrolling a scalar loop when there are
@@ -2201,7 +2217,8 @@ void InnerLoopVectorizer::widenIntInduct
   // Try to create a new independent vector induction variable. If we can't
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
-  if (VF > 1 && IV->getType() == Induction->getType() && Step)
+  if (VF > 1 && IV->getType() == Induction->getType() && Step &&
+      !ValuesNotWidened->count(IV))
     return createVectorIntInductionPHI(ID, Entry, TruncType);
 
   // The scalar value to broadcast. This will be derived from the canonical
@@ -2231,6 +2248,15 @@ void InnerLoopVectorizer::widenIntInduct
     }
   }
 
+  // If an induction variable is only used for counting loop iterations or
+  // calculating addresses, it shouldn't be widened. Scalarize the step vector
+  // to give InstCombine a better chance of simplifying it.
+  if (VF > 1 && ValuesNotWidened->count(IV)) {
+    for (unsigned Part = 0; Part < UF; ++Part)
+      Entry[Part] = getScalarizedStepVector(ScalarIV, VF * Part, Step);
+    return;
+  }
+
   // Finally, splat the scalar induction variable, and build the necessary step
   // vectors.
   Value *Broadcasted = getBroadcastInstrs(ScalarIV);
@@ -2266,6 +2292,29 @@ Value *InnerLoopVectorizer::getStepVecto
   return Builder.CreateAdd(Val, Step, "induction");
 }
 
+Value *InnerLoopVectorizer::getScalarizedStepVector(Value *Val, int StartIdx,
+                                                    Value *Step) {
+
+  // We can't create a vector with fewer than two elements.
+  assert(VF > 1 && "VF should be greater than one");
+
+  // Get the value type and ensure it and the step have the same integer type.
+  Type *ValTy = Val->getType()->getScalarType();
+  assert(ValTy->isIntegerTy() && ValTy == Step->getType() &&
+         "Val and Step should have the same integer type");
+
+  // Compute the scalarized step vector. We perform scalar arithmetic and then
+  // insert the results into the step vector.
+  Value *StepVector = UndefValue::get(ToVectorTy(ValTy, VF));
+  for (unsigned I = 0; I < VF; ++I) {
+    auto *Mul = Builder.CreateMul(ConstantInt::get(ValTy, StartIdx + I), Step);
+    auto *Add = Builder.CreateAdd(Val, Mul);
+    StepVector = Builder.CreateInsertElement(StepVector, Add, I);
+  }
+
+  return StepVector;
+}
+
 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
   auto *SE = PSE.getSE();
@@ -6445,8 +6494,8 @@ void LoopVectorizationCostModel::collect
     auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
 
     // Check that the PHI is only used by the induction increment (UpdateV) or
-    // by GEPs. Then check that UpdateV is only used by a compare instruction or
-    // the loop header PHI.
+    // by GEPs. Then check that UpdateV is only used by a compare instruction,
+    // the loop header PHI, or by GEPs.
     // FIXME: Need precise def-use analysis to determine if this instruction
     // variable will be vectorized.
     if (std::all_of(PN->user_begin(), PN->user_end(),
@@ -6455,7 +6504,8 @@ void LoopVectorizationCostModel::collect
                     }) &&
         std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
                     [&](const User *U) -> bool {
-                      return U == PN || isa<ICmpInst>(U);
+                      return U == PN || isa<ICmpInst>(U) ||
+                             isa<GetElementPtrInst>(U);
                     })) {
       VecValuesToIgnore.insert(PN);
       VecValuesToIgnore.insert(UpdateV);

Modified: llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll?rev=274627&r1=274626&r2=274627&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/gep_with_bitcast.ll Wed Jul  6 09:26:59 2016
@@ -12,11 +12,11 @@ target datalayout = "e-m:e-i64:64-i128:1
 
 ; CHECK-LABEL: @foo
 ; CHECK: vector.body
-; CHECK:  %0 = phi
-; CHECK:  %2 = getelementptr inbounds double*, double** %in, i64 %0
-; CHECK:  %3 = bitcast double** %2 to <4 x i64>*
-; CHECK:  %wide.load = load <4 x i64>, <4 x i64>* %3, align 8
-; CHECK:  %4 = icmp eq <4 x i64> %wide.load, zeroinitializer
+; CHECK:  %[[IV:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:  %[[v0:.+]] = getelementptr inbounds double*, double** %in, i64 %[[IV]]
+; CHECK:  %[[v1:.+]] = bitcast double** %[[v0]] to <4 x i64>*
+; CHECK:  %wide.load = load <4 x i64>, <4 x i64>* %[[v1]], align 8
+; CHECK:  icmp eq <4 x i64> %wide.load, zeroinitializer
 ; CHECK:  br i1
 
 define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 {

Modified: llvm/trunk/test/Transforms/LoopVectorize/induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/induction.ll?rev=274627&r1=274626&r2=274627&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/induction.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/induction.ll Wed Jul  6 09:26:59 2016
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses -instcombine -S | FileCheck %s --check-prefix=INTERLEAVE
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -66,6 +67,137 @@ loopexit:
   ret void
 }
 
+; Make sure we don't create a vector induction phi node that is unused.
+; Scalarize the step vectors instead.
+;
+; for (int i = 0; i < n; ++i)
+;   sum += a[i];
+;
+; IND-LABEL: @scalarize_induction_variable_01(
+; IND:     vector.body:
+; IND:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND-NOT:   add i64 {{.*}}, 2
+; IND:       getelementptr inbounds i64, i64* %a, i64 %index
+;
+; UNROLL-LABEL: @scalarize_induction_variable_01(
+; UNROLL:     vector.body:
+; UNROLL:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NOT:   add i64 {{.*}}, 4
+; UNROLL:       %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 %index
+; UNROLL:       getelementptr i64, i64* %[[g1]], i64 2
+
+define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %sum = phi i64 [ %2, %for.body ], [ 0, %entry ]
+  %0 = getelementptr inbounds i64, i64* %a, i64 %i
+  %1 = load i64, i64* %0, align 8
+  %2 = add i64 %1, %sum
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %3  = phi i64 [ %2, %for.body ]
+  ret i64 %3
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors.
+;
+; float s = 0;
+; for (int i = 0; i < n; i += 8)
+;   s += (a[i] + b[i] + 1.0f);
+;
+; IND-LABEL: @scalarize_induction_variable_02(
+; IND: vector.body:
+; IND:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND:   %[[i0:.+]] = shl i64 %index, 3
+; IND:   %[[i1:.+]] = or i64 %[[i0]], 8
+; IND:   getelementptr inbounds float, float* %a, i64 %[[i0]]
+; IND:   getelementptr inbounds float, float* %a, i64 %[[i1]]
+;
+; UNROLL-LABEL: @scalarize_induction_variable_02(
+; UNROLL: vector.body:
+; UNROLL:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL:   %[[i0:.+]] = shl i64 %index, 3
+; UNROLL:   %[[i1:.+]] = or i64 %[[i0]], 8
+; UNROLL:   %[[i2:.+]] = or i64 %[[i0]], 16
+; UNROLL:   %[[i3:.+]] = or i64 %[[i0]], 24
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i0]]
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i1]]
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i2]]
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i3]]
+
+define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %s = phi float [ 0.0, %entry ], [ %6, %for.body ]
+  %0 = getelementptr inbounds float, float* %a, i64 %i
+  %1 = load float, float* %0, align 4
+  %2 = getelementptr inbounds float, float* %b, i64 %i
+  %3 = load float, float* %2, align 4
+  %4 = fadd fast float %s, 1.0
+  %5 = fadd fast float %4, %1
+  %6 = fadd fast float %5, %3
+  %i.next = add nuw nsw i64 %i, 8
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %s.lcssa = phi float [ %6, %for.body ]
+  ret float %s.lcssa
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors. (Interleaved accesses.)
+;
+; for (int i = 0; i < n; ++i)
+;   a[i].f ^= y;
+;
+; INTERLEAVE-LABEL: @scalarize_induction_variable_03(
+; INTERLEAVE: vector.body:
+; INTERLEAVE:   %[[i0:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTERLEAVE:   %[[i1:.+]] = or i64 %[[i0]], 1
+; INTERLEAVE:   %[[i2:.+]] = or i64 %[[i0]], 2
+; INTERLEAVE:   %[[i3:.+]] = or i64 %[[i0]], 3
+; INTERLEAVE:   %[[i4:.+]] = or i64 %[[i0]], 4
+; INTERLEAVE:   %[[i5:.+]] = or i64 %[[i0]], 5
+; INTERLEAVE:   %[[i6:.+]] = or i64 %[[i0]], 6
+; INTERLEAVE:   %[[i7:.+]] = or i64 %[[i0]], 7
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i0]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i1]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i2]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i3]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i4]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i5]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i6]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i7]], i32 1
+
+%pair = type { i32, i32 }
+define void @scalarize_induction_variable_03(%pair *%p, i32 %y, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %f = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i32, i32* %f, align 8
+  %1 = xor i32 %0, %y
+  store i32 %1, i32* %f, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
 
 ; Make sure that the loop exit count computation does not overflow for i8 and
 ; i16. The exit count of these loops is i8/i16 max + 1. If we don't cast the
@@ -114,9 +246,11 @@ define i32 @i16_loop() nounwind readnone
 ; CHECK-LABEL: max_i32_backedgetaken
 ; CHECK:  br i1 true, label %scalar.ph, label %min.iters.checked
 
+; CHECK: middle.block:
+; CHECK:  %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0
 ; CHECK: scalar.ph:
-; CHECK:  %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %0 ]
-; CHECK:  %bc.merge.rdx = phi i32 [ 1, %0 ], [ 1, %min.iters.checked ], [ %5, %middle.block ]
+; CHECK:  %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ]
+; CHECK:  %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ 1, %min.iters.checked ], [ %[[v9]], %middle.block ]
 
 define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
 

Modified: llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll?rev=274627&r1=274626&r2=274627&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/iv_outside_user.ll Wed Jul  6 09:26:59 2016
@@ -22,8 +22,8 @@ for.end:
 
 ; CHECK-LABEL: @preinc
 ; CHECK-LABEL: middle.block:
-; CHECK: %3 = sub i32 %n.vec, 1
-; CHECK: %ind.escape = add i32 0, %3
+; CHECK: %[[v3:.+]] = sub i32 %n.vec, 1
+; CHECK: %ind.escape = add i32 0, %[[v3]]
 ; CHECK-LABEL: scalar.ph:
 ; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
 ; CHECK-LABEL: for.end:

Modified: llvm/trunk/test/Transforms/LoopVectorize/reverse_induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/reverse_induction.ll?rev=274627&r1=274626&r2=274627&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/reverse_induction.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/reverse_induction.ll Wed Jul  6 09:26:59 2016
@@ -5,9 +5,24 @@ target datalayout = "e-p:64:64:64-i1:8:8
 ; Make sure consecutive vector generates correct negative indices.
 ; PR15882
 
-; CHECK-LABEL: @reverse_induction_i64(
-; CHECK: %step.add = add <4 x i64> %vec.ind, <i64 -4, i64 -4, i64 -4, i64 -4>
-; CHECK: %step.add2 = add <4 x i64> %step.add, <i64 -4, i64 -4, i64 -4, i64 -4>
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i64 %startval, %index
+; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
+; CHECK: %[[v0:.+]] = insertelement <4 x i64> undef, i64 %[[a0]], i64 0
+; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
+; CHECK: %[[v1:.+]] = insertelement <4 x i64> %[[v0]], i64 %[[a1]], i64 1
+; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
+; CHECK: %[[v2:.+]] = insertelement <4 x i64> %[[v1]], i64 %[[a2]], i64 2
+; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
+; CHECK: %[[v3:.+]] = insertelement <4 x i64> %[[v2]], i64 %[[a3]], i64 3
+; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
+; CHECK: %[[v4:.+]] = insertelement <4 x i64> undef, i64 %[[a4]], i64 0
+; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
+; CHECK: %[[v5:.+]] = insertelement <4 x i64> %[[v4]], i64 %[[a5]], i64 1
+; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
+; CHECK: %[[v6:.+]] = insertelement <4 x i64> %[[v5]], i64 %[[a6]], i64 2
+; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
+; CHECK: %[[v7:.+]] = insertelement <4 x i64> %[[v6]], i64 %[[a7]], i64 3
 
 define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) {
 entry:
@@ -30,8 +45,25 @@ loopend:
 }
 
 ; CHECK-LABEL: @reverse_induction_i128(
-; CHECK: %step.add = add <4 x i128> %vec.ind, <i128 -4, i128 -4, i128 -4, i128 -4>
-; CHECK: %step.add2 = add <4 x i128> %step.add, <i128 -4, i128 -4, i128 -4, i128 -4>
+; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i128 %startval, %index
+; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0
+; CHECK: %[[v0:.+]] = insertelement <4 x i128> undef, i128 %[[a0]], i64 0
+; CHECK: %[[a1:.+]] = add i128 %offset.idx, -1
+; CHECK: %[[v1:.+]] = insertelement <4 x i128> %[[v0]], i128 %[[a1]], i64 1
+; CHECK: %[[a2:.+]] = add i128 %offset.idx, -2
+; CHECK: %[[v2:.+]] = insertelement <4 x i128> %[[v1]], i128 %[[a2]], i64 2
+; CHECK: %[[a3:.+]] = add i128 %offset.idx, -3
+; CHECK: %[[v3:.+]] = insertelement <4 x i128> %[[v2]], i128 %[[a3]], i64 3
+; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4
+; CHECK: %[[v4:.+]] = insertelement <4 x i128> undef, i128 %[[a4]], i64 0
+; CHECK: %[[a5:.+]] = add i128 %offset.idx, -5
+; CHECK: %[[v5:.+]] = insertelement <4 x i128> %[[v4]], i128 %[[a5]], i64 1
+; CHECK: %[[a6:.+]] = add i128 %offset.idx, -6
+; CHECK: %[[v6:.+]] = insertelement <4 x i128> %[[v5]], i128 %[[a6]], i64 2
+; CHECK: %[[a7:.+]] = add i128 %offset.idx, -7
+; CHECK: %[[v7:.+]] = insertelement <4 x i128> %[[v6]], i128 %[[a7]], i64 3
+
 define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
 entry:
   br label %for.body
@@ -53,8 +85,24 @@ loopend:
 }
 
 ; CHECK-LABEL: @reverse_induction_i16(
-; CHECK: add <4 x i16> %[[SPLAT:.*]], <i16 0, i16 -1, i16 -2, i16 -3>
-; CHECK: add <4 x i16> %[[SPLAT]], <i16 -4, i16 -5, i16 -6, i16 -7>
+; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i16 %startval, {{.*}}
+; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0
+; CHECK: %[[v0:.+]] = insertelement <4 x i16> undef, i16 %[[a0]], i64 0
+; CHECK: %[[a1:.+]] = add i16 %offset.idx, -1
+; CHECK: %[[v1:.+]] = insertelement <4 x i16> %[[v0]], i16 %[[a1]], i64 1
+; CHECK: %[[a2:.+]] = add i16 %offset.idx, -2
+; CHECK: %[[v2:.+]] = insertelement <4 x i16> %[[v1]], i16 %[[a2]], i64 2
+; CHECK: %[[a3:.+]] = add i16 %offset.idx, -3
+; CHECK: %[[v3:.+]] = insertelement <4 x i16> %[[v2]], i16 %[[a3]], i64 3
+; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4
+; CHECK: %[[v4:.+]] = insertelement <4 x i16> undef, i16 %[[a4]], i64 0
+; CHECK: %[[a5:.+]] = add i16 %offset.idx, -5
+; CHECK: %[[v5:.+]] = insertelement <4 x i16> %[[v4]], i16 %[[a5]], i64 1
+; CHECK: %[[a6:.+]] = add i16 %offset.idx, -6
+; CHECK: %[[v6:.+]] = insertelement <4 x i16> %[[v5]], i16 %[[a6]], i64 2
+; CHECK: %[[a7:.+]] = add i16 %offset.idx, -7
+; CHECK: %[[v7:.+]] = insertelement <4 x i16> %[[v6]], i16 %[[a7]], i64 3
 
 define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) {
 entry: