[llvm] r266363 - [ARM] Adding IEEE-754 SIMD detection to loop vectorizer

Renato Golin via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 14 13:42:19 PDT 2016


Author: rengolin
Date: Thu Apr 14 15:42:18 2016
New Revision: 266363

URL: http://llvm.org/viewvc/llvm-project?rev=266363&view=rev
Log:
[ARM] Adding IEEE-754 SIMD detection to loop vectorizer

Some SIMD implementations are not IEEE-754 compliant, for example ARM's NEON.

This patch teaches the loop vectorizer to only allow transformations of loops
that either contain no floating-point operations or have enough allowance
flags supporting lack of precision (e.g. -ffast-math, Darwin).

For that, the target description now has a method which tells us if the
vectorizer is allowed to handle FP math without falling into unsafe
representations, plus a check on every FP instruction in the candidate loop
to check for the safety flags.

This commit makes LLVM behave like GCC with respect to ARM NEON support, but
it stops short of fixing the underlying problem: sub-normals. Neither GCC
nor LLVM have a flag for allowing sub-normal operations. Before this patch,
GCC only allows it using unsafe-math flags and LLVM allows it by default with
no way to turn it off (short of not using NEON at all).

As a first step, we push this change to make it safe and in sync with GCC.
The second step is to discuss a new sub-normals flag with both communities
and come up with a common solution. The third step is to improve the FastMath
flags in LLVM to encode sub-normals and use those flags to restrict NEON FP.

Fixes PR16275.

Added:
    llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
Modified:
    llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
    llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
    llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp

Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h?rev=266363&r1=266362&r2=266363&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h Thu Apr 14 15:42:18 2016
@@ -367,6 +367,15 @@ public:
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
+  /// \brief Indicate that it is potentially unsafe to automatically vectorize
+  /// floating-point operations because the semantics of vector and scalar
+  /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math
+  /// does not support IEEE-754 denormal numbers, while depending on the
+  /// platform, scalar floating-point math does.
+  /// This applies to floating-point math operations and calls, not memory
+  /// operations, shuffles, or casts.
+  bool isFPVectorizationPotentiallyUnsafe() const;
+
   /// \brief Return hardware support for population count.
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
 
@@ -621,6 +630,7 @@ public:
   virtual bool shouldBuildLookupTables() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
+  virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
   virtual int getFPOpCost(Type *Ty) = 0;
@@ -779,6 +789,9 @@ public:
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
+  bool isFPVectorizationPotentiallyUnsafe() override {
+    return Impl.isFPVectorizationPotentiallyUnsafe();
+  }
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
     return Impl.getPopcntSupport(IntTyWidthInBit);
   }

Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h?rev=266363&r1=266362&r2=266363&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h Thu Apr 14 15:42:18 2016
@@ -240,6 +240,8 @@ public:
 
   bool enableInterleavedAccessVectorization() { return false; }
 
+  bool isFPVectorizationPotentiallyUnsafe() { return false; }
+
   TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) {
     return TTI::PSK_Software;
   }

Modified: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/TargetTransformInfo.cpp?rev=266363&r1=266362&r2=266363&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp Thu Apr 14 15:42:18 2016
@@ -172,6 +172,10 @@ bool TargetTransformInfo::enableInterlea
   return TTIImpl->enableInterleavedAccessVectorization();
 }
 
+bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
+  return TTIImpl->isFPVectorizationPotentiallyUnsafe();
+}
+
 TargetTransformInfo::PopcntSupportKind
 TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
   return TTIImpl->getPopcntSupport(IntTyWidthInBit);

Modified: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h?rev=266363&r1=266362&r2=266363&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h (original)
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h Thu Apr 14 15:42:18 2016
@@ -54,6 +54,10 @@ public:
 
   bool enableInterleavedAccessVectorization() { return true; }
 
+  bool isFPVectorizationPotentiallyUnsafe() {
+    return !ST->hasFPARMv8() && !ST->isTargetDarwin();
+  }
+
   /// \name Scalar TTI Implementations
   /// @{
 

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=266363&r1=266362&r2=266363&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Thu Apr 14 15:42:18 2016
@@ -963,6 +963,9 @@ class LoopVectorizeHints {
   /// Return the loop metadata prefix.
   static StringRef Prefix() { return "llvm.loop."; }
 
+  /// True if there is any unsafe math in the loop.
+  bool PotentiallyUnsafe;
+
 public:
   enum ForceKind {
     FK_Undefined = -1, ///< Not selected.
@@ -975,7 +978,7 @@ public:
               HK_WIDTH),
         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
         Force("vectorize.enable", FK_Undefined, HK_FORCE),
-        TheLoop(L) {
+        PotentiallyUnsafe(false), TheLoop(L) {
     // Populate values with existing loop metadata.
     getHintsFromMetadata();
 
@@ -1073,6 +1076,19 @@ public:
     return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
   }
 
+  bool isPotentiallyUnsafe() const {
+    // Avoid FP vectorization if the target is unsure about proper support.
+    // This may be related to the SIMD unit in the target not handling
+    // IEEE 754 FP ops properly, or bad single-to-double promotions.
+    // Otherwise, a sequence of vectorized loops, even without reduction,
+    // could lead to different end results on the destination vectors.
+    return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
+  }
+
+  void setPotentiallyUnsafe() {
+    PotentiallyUnsafe = true;
+  }
+
 private:
   /// Find hints specified in the loop metadata and update local values.
   void getHintsFromMetadata() {
@@ -1234,7 +1250,7 @@ public:
                             const TargetTransformInfo *TTI,
                             LoopAccessAnalysis *LAA,
                             LoopVectorizationRequirements *R,
-                            const LoopVectorizeHints *H)
+                            LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
         TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
         Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
@@ -1460,7 +1476,7 @@ private:
   LoopVectorizationRequirements *Requirements;
 
   /// Used to emit an analysis of any legality issues.
-  const LoopVectorizeHints *Hints;
+  LoopVectorizeHints *Hints;
 
   ValueToValueMap Strides;
   SmallPtrSet<Value *, 8> StrideSet;
@@ -1884,6 +1900,21 @@ struct LoopVectorize : public FunctionPa
       return false;
     }
 
+    // Check if the target supports potentially unsafe FP vectorization.
+    // FIXME: Add a check for the type of safety issue (denormal, signaling)
+    // for the target we're vectorizing for, to make sure none of the
+    // additional fp-math flags can help.
+    if (Hints.isPotentiallyUnsafe() &&
+        TTI->isFPVectorizationPotentiallyUnsafe()) {
+      DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
+      emitAnalysisDiag(
+          F, L, Hints,
+          VectorizationReport()
+             << "loop not vectorized due to unsafe FP support.");
+      emitMissedWarning(F, L, Hints);
+      return false;
+    }
+
     // Select the optimal vectorization factor.
     const LoopVectorizationCostModel::VectorizationFactor VF =
         CM.selectVectorizationFactor(OptForSize);
@@ -4695,12 +4726,23 @@ bool LoopVectorizationLegality::canVecto
         }
         if (EnableMemAccessVersioning)
           collectStridedAccess(ST);
-      }
 
-      if (EnableMemAccessVersioning)
-        if (LoadInst *LI = dyn_cast<LoadInst>(it))
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(it)) {
+        if (EnableMemAccessVersioning)
           collectStridedAccess(LI);
 
+      // FP instructions can allow unsafe algebra, thus vectorizable by
+      // non-IEEE-754 compliant SIMD units.
+      // This applies to floating-point math operations and calls, not memory
+      // operations, shuffles, or casts, as they don't change precision or
+      // semantics.
+      } else if (it->getType()->isFloatingPointTy() &&
+                (CI || it->isBinaryOp()) &&
+                !it->hasUnsafeAlgebra()) {
+        DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+        Hints->setPotentiallyUnsafe();
+      }
+
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
       if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {

Added: llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll?rev=266363&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll Thu Apr 14 15:42:18 2016
@@ -0,0 +1,335 @@
+; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX-V7
+; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX-V8
+; RUN: opt -mtriple armv7-unknwon-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
+
+; Testing the ability of the loop vectorizer to tell when SIMD is safe or not
+; regarding IEEE 754 standard.
+; On Linux, we only want the vectorizer to work when -ffast-math flag is set,
+; because NEON is not IEEE compliant.
+; Darwin, on the other hand, doesn't support subnormals, and all optimizations
+; are allowed, even without -ffast-math.
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "sumi"
+; CHECK: We can vectorize this loop!
+define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Floating-point loops need fast-math to be vectorizeable
+; LINUX-V7: Checking a loop in "sumf"
+; LINUX-V7: Potentially unsafe FP op prevents vectorization
+; LINUX-V8: Checking a loop in "sumf"
+; LINUX-V8: We can vectorize this loop!
+; DARWIN: Checking a loop in "sumf"
+; DARWIN: We can vectorize this loop!
+define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+  store float %mul, float* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "redi"
+; CHECK: We can vectorize this loop!
+define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops need fast-math to be vectorizeable
+; LINUX-V7: Checking a loop in "redf"
+; LINUX-V7: Potentially unsafe FP op prevents vectorization
+; LINUX-V8: Checking a loop in "redf"
+; LINUX-V8: We can vectorize this loop!
+; DARWIN: Checking a loop in "redf"
+; DARWIN: We can vectorize this loop!
+define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul float %0, %1
+  %add = fadd float %Red.06, %mul
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi float [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; LINUX-V7: Checking a loop in "fabs"
+; LINUX-V7: Potentially unsafe FP op prevents vectorization
+; LINUX-V8: Checking a loop in "fabs"
+; LINUX-V8: We can vectorize this loop!
+; DARWIN: Checking a loop in "fabs"
+; DARWIN: We can vectorize this loop!
+define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp10 = icmp eq i32 %N, 0
+  br i1 %cmp10, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+  %1 = load float, float* %arrayidx1, align 4
+  %fabsf = tail call float @fabsf(float %1) #1
+  %conv3 = fmul float %0, %fabsf
+  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+  store float %conv3, float* %arrayidx4, align 4
+  %inc = add nuw nsw i32 %i.011, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "sumi_fast"
+; CHECK: We can vectorize this loop!
+define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Floating-point loops can be vectorizeable with fast-math
+; CHECK: Checking a loop in "sumf_fast"
+; CHECK: We can vectorize this loop!
+define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul fast float %1, %0
+  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+  store float %mul, float* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Integer loops are always vectorizeable
+; CHECK: Checking a loop in "redi_fast"
+; CHECK: We can vectorize this loop!
+define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+  %1 = load i32, i32* %arrayidx1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops can be vectorizeable with fast-math
+; CHECK: Checking a loop in "redf_fast"
+; CHECK: We can vectorize this loop!
+define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp5 = icmp eq i32 %N, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+  %1 = load float, float* %arrayidx1, align 4
+  %mul = fmul fast float %1, %0
+  %add = fadd fast float %mul, %Red.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  %add.lcssa = phi float [ %add, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+  ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; CHECK: Checking a loop in "fabs_fast"
+; CHECK: We can vectorize this loop!
+define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp10 = icmp eq i32 %N, 0
+  br i1 %cmp10, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+  %1 = load float, float* %arrayidx1, align 4
+  %fabsf = tail call fast float @fabsf(float %1) #2
+  %conv3 = fmul fast float %fabsf, %0
+  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+  store float %conv3, float* %arrayidx4, align 4
+  %inc = add nuw nsw i32 %i.011, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @fabsf(float)
+
+attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }




More information about the llvm-commits mailing list