[llvm] r296145 - [LV] Merge floating-point and integer induction widening code

Fri Feb 24 10:20:12 PST 2017

Author: mssimpso
Date: Fri Feb 24 12:20:12 2017
New Revision: 296145

URL: http://llvm.org/viewvc/llvm-project?rev=296145&view=rev
Log:
[LV] Merge floating-point and integer induction widening code

This patch merges the existing floating-point induction variable widening code
into the integer induction variable widening code, creating a single set of
functions for both kinds of inductions. The primary motivation for doing this
is to enable vector phi node creation for floating-point induction variables.

Differential Revision: https://reviews.llvm.org/D30211

Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=296145&r1=296144&r2=296145&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Fri Feb 24 12:20:12 2017
@@ -369,6 +369,23 @@ static bool hasIrregularType(Type *Ty, c
 ///       we always assume predicated blocks have a 50% chance of executing.
 static unsigned getReciprocalPredBlockProb() { return 2; }
 
+/// A helper function that adds a 'fast' flag to floating-point operations.
+static Value *addFastMathFlag(Value *V) {
+  if (isa<FPMathOperator>(V)) {
+    FastMathFlags Flags;
+    Flags.setUnsafeAlgebra();
+    cast<Instruction>(V)->setFastMathFlags(Flags);
+  }
+  return V;
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+                           : ConstantFP::get(Ty, C);
+}
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -522,19 +539,21 @@ protected:
   /// \p EntryVal is the value from the original loop that maps to the steps.
   /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
   /// can be a truncate instruction).
-  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal);
+  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+                        const InductionDescriptor &ID);
 
   /// Create a vector induction phi node based on an existing scalar one. \p
   /// EntryVal is the value from the original loop that maps to the vector phi
   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
   /// truncate instruction, instead of widening the original IV, we widen a
   /// version of the IV truncated to \p EntryVal's type.
-  void createVectorIntInductionPHI(const InductionDescriptor &II, Value *Step,
-                                   Instruction *EntryVal);
+  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
+                                       Value *Step, Instruction *EntryVal);
 
-  /// Widen an integer induction variable \p IV. If \p Trunc is provided, the
-  /// induction variable will first be truncated to the corresponding type.
-  void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+  /// is provided, the integer induction variable will first be truncated to
+  /// the corresponding type.
+  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
 
   /// Returns true if an instruction \p I should be scalarized instead of
   /// vectorized for the chosen vectorization factor.
@@ -2324,30 +2343,46 @@ Value *InnerLoopVectorizer::getBroadcast
   return Shuf;
 }
 
-void InnerLoopVectorizer::createVectorIntInductionPHI(
+void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
   Value *Start = II.getStartValue();
-  assert(Step->getType()->isIntegerTy() &&
-         "Cannot widen an IV having a step with a non-integer type");
 
   // Construct the initial value of the vector IV in the vector loop preheader
   auto CurrIP = Builder.saveIP();
   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
   if (isa<TruncInst>(EntryVal)) {
+    assert(Start->getType()->isIntegerTy() &&
+           "Truncation requires an integer type");
     auto *TruncType = cast<IntegerType>(EntryVal->getType());
     Step = Builder.CreateTrunc(Step, TruncType);
     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
   }
   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
-  Value *SteppedStart = getStepVector(SplatStart, 0, Step);
+  Value *SteppedStart =
+      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+
+  // We create vector phi nodes for both integer and floating-point induction
+  // variables. Here, we determine the kind of arithmetic we will perform.
+  Instruction::BinaryOps AddOp;
+  Instruction::BinaryOps MulOp;
+  if (Step->getType()->isIntegerTy()) {
+    AddOp = Instruction::Add;
+    MulOp = Instruction::Mul;
+  } else {
+    AddOp = II.getInductionOpcode();
+    MulOp = Instruction::FMul;
+  }
+
+  // Multiply the vectorization factor by the step using integer or
+  // floating-point arithmetic as appropriate.
+  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
+  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
 
   // Create a vector splat to use in the induction update.
   //
   // FIXME: If the step is non-constant, we create the vector splat with
   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
   //        handle a constant vector splat.
-  auto *ConstVF = ConstantInt::getSigned(Step->getType(), VF);
-  auto *Mul = Builder.CreateMul(Step, ConstVF);
   Value *SplatVF = isa<Constant>(Mul)
                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                        : Builder.CreateVectorSplat(VF, Mul);
@@ -2361,8 +2396,8 @@ void InnerLoopVectorizer::createVectorIn
   VectorParts Entry(UF);
   for (unsigned Part = 0; Part < UF; ++Part) {
     Entry[Part] = LastInduction;
-    LastInduction = cast<Instruction>(
-        Builder.CreateAdd(LastInduction, SplatVF, "step.add"));
+    LastInduction = cast<Instruction>(addFastMathFlag(
+        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
   }
   VectorLoopValueMap.initVector(EntryVal, Entry);
   if (isa<TruncInst>(EntryVal))
@@ -2395,7 +2430,10 @@ bool InnerLoopVectorizer::needsScalarInd
   return any_of(IV->users(), isScalarInst);
 }
 
-void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+
+  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
+         "Primary induction variable must have an integer type");
 
   auto II = Legal->getInductionVars()->find(IV);
   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
@@ -2424,15 +2462,20 @@ void InnerLoopVectorizer::widenIntInduct
   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
          "Induction step should be loop invariant");
   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
-  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
-  Value *Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
-                                  LoopVectorPreHeader->getTerminator());
+  Value *Step = nullptr;
+  if (PSE.getSE()->isSCEVable(IV->getType())) {
+    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+                             LoopVectorPreHeader->getTerminator());
+  } else {
+    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
+  }
 
   // Try to create a new independent vector induction variable. If we can't
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
-    createVectorIntInductionPHI(ID, Step, EntryVal);
+    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
     VectorizedIV = true;
   }
 
@@ -2451,7 +2494,10 @@ void InnerLoopVectorizer::widenIntInduct
     } else {
       ScalarIV = Induction;
       if (IV != OldInduction) {
-        ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
+        ScalarIV = IV->getType()->isIntegerTy()
+                       ? Builder.CreateSExtOrTrunc(ScalarIV, IV->getType())
+                       : Builder.CreateCast(Instruction::SIToFP, Induction,
+                                            IV->getType());
         ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
         ScalarIV->setName("offset.idx");
       }
@@ -2464,7 +2510,8 @@ void InnerLoopVectorizer::widenIntInduct
     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
     VectorParts Entry(UF);
     for (unsigned Part = 0; Part < UF; ++Part)
-      Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+      Entry[Part] =
+          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
     VectorLoopValueMap.initVector(EntryVal, Entry);
     if (Trunc)
       addMetadata(Entry, Trunc);
@@ -2477,7 +2524,7 @@ void InnerLoopVectorizer::widenIntInduct
   // in the loop in the common case prior to InstCombine. We will be trading
   // one vector extract for each scalar step.
   if (NeedsScalarIV)
-    buildScalarSteps(ScalarIV, Step, EntryVal);
+    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
 }
 
 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -2537,15 +2584,28 @@ Value *InnerLoopVectorizer::getStepVecto
 }
 
 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
-                                           Value *EntryVal) {
+                                           Value *EntryVal,
+                                           const InductionDescriptor &ID) {
 
   // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF > 1 && "VF should be greater than one");
 
   // Get the value type and ensure it and the step have the same integer type.
   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
-  assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
-         "Val and Step should have the same integer type");
+  assert(ScalarIVTy == Step->getType() &&
+         "Val and Step should have the same type");
+
+  // We build scalar steps for both integer and floating-point induction
+  // variables. Here, we determine the kind of arithmetic we will perform.
+  Instruction::BinaryOps AddOp;
+  Instruction::BinaryOps MulOp;
+  if (ScalarIVTy->isIntegerTy()) {
+    AddOp = Instruction::Add;
+    MulOp = Instruction::Mul;
+  } else {
+    AddOp = ID.getInductionOpcode();
+    MulOp = Instruction::FMul;
+  }
 
   // Determine the number of scalars we need to generate for each unroll
   // iteration. If EntryVal is uniform, we only need to generate the first
@@ -2558,9 +2618,9 @@ void InnerLoopVectorizer::buildScalarSte
   for (unsigned Part = 0; Part < UF; ++Part) {
     Entry[Part].resize(VF);
     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-      auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
-      auto *Mul = Builder.CreateMul(StartIdx, Step);
-      auto *Add = Builder.CreateAdd(ScalarIV, Mul);
+      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
+      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
+      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
       Entry[Part][Lane] = Add;
     }
   }
@@ -3643,16 +3703,6 @@ static void cse(BasicBlock *BB) {
   }
 }
 
-/// \brief Adds a 'fast' flag to floating point operations.
-static Value *addFastMathFlag(Value *V) {
-  if (isa<FPMathOperator>(V)) {
-    FastMathFlags Flags;
-    Flags.setUnsafeAlgebra();
-    cast<Instruction>(V)->setFastMathFlags(Flags);
-  }
-  return V;
-}
-
 /// \brief Estimate the overhead of scalarizing an instruction. This is a
 /// convenience wrapper for the type-based getScalarizationOverhead API.
 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
@@ -4653,7 +4703,8 @@ void InnerLoopVectorizer::widenPHIInstru
   case InductionDescriptor::IK_NoInduction:
     llvm_unreachable("Unknown induction");
   case InductionDescriptor::IK_IntInduction:
-    return widenIntInduction(P);
+  case InductionDescriptor::IK_FpInduction:
+    return widenIntOrFpInduction(P);
   case InductionDescriptor::IK_PtrInduction: {
     // Handle the pointer induction variable case.
     assert(P->getType()->isPointerTy() && "Unexpected type.");
@@ -4680,30 +4731,6 @@ void InnerLoopVectorizer::widenPHIInstru
     VectorLoopValueMap.initScalar(P, Entry);
     return;
   }
-  case InductionDescriptor::IK_FpInduction: {
-    assert(P->getType() == II.getStartValue()->getType() &&
-           "Types must match");
-    // Handle other induction variables that are now based on the
-    // canonical one.
-    assert(P != OldInduction && "Primary induction can be integer only");
-
-    Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
-    V = II.transform(Builder, V, PSE.getSE(), DL);
-    V->setName("fp.offset.idx");
-
-    // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal
-
-    Value *Broadcasted = getBroadcastInstrs(V);
-    // After broadcasting the induction variable we need to make the vector
-    // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
-    Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
-    VectorParts Entry(UF);
-    for (unsigned part = 0; part < UF; ++part)
-      Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
-                                  II.getInductionOpcode());
-    VectorLoopValueMap.initVector(P, Entry);
-    return;
-  }
   }
 }
 
@@ -4878,8 +4905,8 @@ void InnerLoopVectorizer::vectorizeBlock
       // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
       // (c) other casts depend on pointer size.
       if (Cost->isOptimizableIVTruncate(CI, VF)) {
-        widenIntInduction(cast<PHINode>(CI->getOperand(0)),
-                          cast<TruncInst>(CI));
+        widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
+                              cast<TruncInst>(CI));
         break;
       }
 

Modified: llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll?rev=296145&r1=296144&r2=296145&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/float-induction.ll Fri Feb 24 12:20:12 2017
@@ -15,47 +15,50 @@
 
 ; VEC4_INTERL1-LABEL: @fp_iv_loop1(
 ; VEC4_INTERL1:       vector.ph:
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL1-NEXT:    [[INDUCTION4:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]]
+; VEC4_INTERL1-NEXT:    [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer
 ; VEC4_INTERL1-NEXT:    br label %vector.body
 ; VEC4_INTERL1:       vector.body:
 ; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; VEC4_INTERL1-NEXT:    [[TMP5:%.*]] = sitofp i64 [[INDEX]] to float
-; VEC4_INTERL1-NEXT:    [[TMP6:%.*]] = fmul fast float %fpinc, [[TMP5]]
-; VEC4_INTERL1-NEXT:    [[FP_OFFSET_IDX:%.*]] = fsub fast float %init, [[TMP6]]
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX]], i32 0
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0
-; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; VEC4_INTERL1-NEXT:    [[INDUCTION5:%.*]] = fsub fast <4 x float> [[BROADCAST_SPLAT4]], [[TMP7]]
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION4]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
 ; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
 ; VEC4_INTERL1-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; VEC4_INTERL1-NEXT:    store <4 x float> [[INDUCTION5]], <4 x float>* [[TMP9]], align 4
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP9]], align 4
 ; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT6]]
 ; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 ; VEC4_INTERL2-LABEL: @fp_iv_loop1(
 ; VEC4_INTERL2:       vector.ph:
+; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT4]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL2-NEXT:    [[INDUCTION5:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]]
+; VEC4_INTERL2-NEXT:    [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00
+; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> undef, <4 x i32> zeroinitializer
 ; VEC4_INTERL2-NEXT:    br label %vector.body
 ; VEC4_INTERL2:       vector.body:
 ; VEC4_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; VEC4_INTERL2-NEXT:    [[TMP5:%.*]] = sitofp i64 [[INDEX]] to float
-; VEC4_INTERL2-NEXT:    [[TMP6:%.*]] = fmul fast float %fpinc, [[TMP5]]
-; VEC4_INTERL2-NEXT:    [[FP_OFFSET_IDX:%.*]] = fsub fast float %init, [[TMP6]]
-; VEC4_INTERL2-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX]], i32 0
-; VEC4_INTERL2-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT4]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0
-; VEC4_INTERL2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL2-NEXT:    [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; VEC4_INTERL2-NEXT:    [[INDUCTION6:%.*]] = fsub fast <4 x float> [[BROADCAST_SPLAT5]], [[TMP7]]
-; VEC4_INTERL2-NEXT:    [[TMP8:%.*]] = fmul fast <4 x float> [[DOTSPLAT]], <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>
-; VEC4_INTERL2-NEXT:    [[INDUCTION9:%.*]] = fsub fast <4 x float> [[BROADCAST_SPLAT5]], [[TMP8]]
+; VEC4_INTERL2-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION5]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL2-NEXT:    [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT7]]
 ; VEC4_INTERL2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
 ; VEC4_INTERL2-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
-; VEC4_INTERL2-NEXT:    store <4 x float> [[INDUCTION6]], <4 x float>* [[TMP10]], align 4
+; VEC4_INTERL2-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP10]], align 4
 ; VEC4_INTERL2-NEXT:    [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i64 4
 ; VEC4_INTERL2-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
-; VEC4_INTERL2-NEXT:    store <4 x float> [[INDUCTION9]], <4 x float>* [[TMP12]], align 4
+; VEC4_INTERL2-NEXT:    store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP12]], align 4
 ; VEC4_INTERL2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; VEC4_INTERL2-NEXT:    [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT7]]
 ; VEC4_INTERL2:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 ; VEC1_INTERL2-LABEL: @fp_iv_loop1(
@@ -112,19 +115,18 @@ for.end:
 
 ; VEC4_INTERL1-LABEL: @fp_iv_loop2(
 ; VEC4_INTERL1:       vector.ph:
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
 ; VEC4_INTERL1-NEXT:    br label %vector.body
 ; VEC4_INTERL1:       vector.body:
 ; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; VEC4_INTERL1-NEXT:    [[TMP5:%.*]] = sitofp i64 [[INDEX]] to float
-; VEC4_INTERL1-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP5]], 5.000000e-01
-; VEC4_INTERL1-NEXT:    [[FP_OFFSET_IDX:%.*]] = fadd fast float [[TMP6]], %init
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX]], i32 0
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL1-NEXT:    [[INDUCTION5:%.*]] = fadd fast <4 x float> [[BROADCAST_SPLAT4]], <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION2]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
 ; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
 ; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
-; VEC4_INTERL1-NEXT:    store <4 x float> [[INDUCTION5]], <4 x float>* [[TMP8]], align 4
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4
 ; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
 ; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 {
@@ -170,30 +172,27 @@ for.end:
 ; VEC4_INTERL1:       for.body.lr.ph:
 ; VEC4_INTERL1:         [[TMP0:%.*]] = load float, float* @fp_inc, align 4
 ; VEC4_INTERL1:       vector.ph:
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT14]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL1-NEXT:    br label %vector.body
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT6]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL1-NEXT:    [[INDUCTION7:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP7]]
+; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP0]], 4.000000e+00
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> undef, float [[TMP8]], i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0
+; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT12]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC4_INTERL1:       vector.body:
 ; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = sitofp i64 [[INDEX]] to float
-; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP7]], -5.000000e-01
-; VEC4_INTERL1-NEXT:    [[FP_OFFSET_IDX:%.*]] = fadd fast float [[TMP8]], 0x3FB99999A0000000
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX]], i32 0
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT6]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL1-NEXT:    [[TMP9:%.*]] = sitofp i64 [[INDEX]] to float
-; VEC4_INTERL1-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP0]], [[TMP9]]
-; VEC4_INTERL1-NEXT:    [[FP_OFFSET_IDX10:%.*]] = fadd fast float [[TMP10]], %init
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX10]], i32 0
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT11]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0
-; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL1-NEXT:    [[TMP11:%.*]] = fmul fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; VEC4_INTERL1-NEXT:    [[INDUCTION13:%.*]] = fadd fast <4 x float> [[BROADCAST_SPLAT12]], [[TMP11]]
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ <float 0x3FB99999A0000000, float 0xBFD99999A0000000, float 0xBFECCCCCC0000000, float 0xBFF6666660000000>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
 ; VEC4_INTERL1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
 ; VEC4_INTERL1-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; VEC4_INTERL1-NEXT:    store <4 x float> [[INDUCTION13]], <4 x float>* [[TMP13]], align 4
-; VEC4_INTERL1-NEXT:    [[TMP14:%.*]] = fadd fast <4 x float> [[INDUCTION13]], [[BROADCAST_SPLAT15]]
-; VEC4_INTERL1-NEXT:    [[TMP15:%.*]] = fadd fast <4 x float> [[BROADCAST_SPLAT7]], <float -5.000000e-01, float -1.000000e+00, float -1.500000e+00, float -2.000000e+00>
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND10]], <4 x float>* [[TMP13]], align 4
+; VEC4_INTERL1-NEXT:    [[TMP14:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT13]]
+; VEC4_INTERL1-NEXT:    [[TMP15:%.*]] = fadd fast <4 x float> [[VEC_IND]], <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
 ; VEC4_INTERL1-NEXT:    [[TMP16:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP14]]
 ; VEC4_INTERL1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* %B, i64 [[INDEX]]
 ; VEC4_INTERL1-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>*
@@ -202,6 +201,8 @@ for.end:
 ; VEC4_INTERL1-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>*
 ; VEC4_INTERL1-NEXT:    store <4 x float> [[TMP15]], <4 x float>* [[TMP20]], align 4
 ; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00>
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT11]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]]
 ; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 {
@@ -252,16 +253,12 @@ for.end:
 ; VEC4_INTERL1-NEXT:    br label %vector.body
 ; VEC4_INTERL1:       vector.body:
 ; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; VEC4_INTERL1-NEXT:    [[TMP5:%.*]] = sitofp i64 [[INDEX]] to float
-; VEC4_INTERL1-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP5]], 5.000000e-01
-; VEC4_INTERL1-NEXT:    [[FP_OFFSET_IDX:%.*]] = fadd fast float [[TMP6]], 1.000000e+00
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x float> undef, float [[FP_OFFSET_IDX]], i32 0
-; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL1-NEXT:    [[INDUCTION5:%.*]] = fadd fast <4 x float> [[BROADCAST_SPLAT4]], <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
 ; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
 ; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
-; VEC4_INTERL1-NEXT:    store <4 x float> [[INDUCTION5]], <4 x float>* [[TMP8]], align 4
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4
 ; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
 ; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) {
@@ -294,9 +291,6 @@ for.end:
 ; VEC2_INTERL1_PRED_STORE:       vector.body:
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ], [ 0, %min.iters.checked ]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT3]], <2 x float> undef, <2 x i32> zeroinitializer
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[INDUCTION5:%.*]] = fadd fast <2 x float> [[BROADCAST_SPLAT4]], <float 0.000000e+00, float 1.000000e+00>
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>*
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
@@ -304,15 +298,14 @@ for.end:
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_IF]]:
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[INDUCTION5]], i32 0
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
-; VEC2_INTERL1_PRED_STORE-NEXT:    store float [[TMP6]], float* [[TMP7]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT:    store float [[TMP1]], float* [[TMP7]], align 4
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_CONTINUE]]:
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
 ; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]]
 ; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_IF6]]:
-; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[INDUCTION5]], i32 1
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP9:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP10:%.*]] = or i64 [[INDEX]], 1
 ; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* %A, i64 [[TMP10]]
 ; VEC2_INTERL1_PRED_STORE-NEXT:    store float [[TMP9]], float* [[TMP11]], align 4