[llvm] 9f76a85 - [LoopVectorize] Enable strict reductions when allowReordering() returns false
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Wed May 26 06:06:18 PDT 2021
Author: Kerry McLaughlin
Date: 2021-05-26T13:59:12+01:00
New Revision: 9f76a8526010015fc3e5046fb2c5925000ac45a4
URL: https://github.com/llvm/llvm-project/commit/9f76a8526010015fc3e5046fb2c5925000ac45a4
DIFF: https://github.com/llvm/llvm-project/commit/9f76a8526010015fc3e5046fb2c5925000ac45a4.diff
LOG: [LoopVectorize] Enable strict reductions when allowReordering() returns false
When loop hints are passed via metadata, the allowReordering function
in LoopVectorizeHints allows the order of floating-point operations to
be changed:

  bool allowReordering() const {
    // When enabling loop hints are provided we allow the vectorizer to change
    // the order of operations that is given by the scalar loop. This is not
    // enabled by default because it can be unsafe or inefficient.
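For reference, a loop is considered force-enabled when its !llvm.loop
metadata carries the vectorize.enable hint, so that getForce() returns
FK_Enabled. A minimal sketch (the metadata IDs are illustrative):

  br i1 %cond, label %for.end, label %for.body, !llvm.loop !0

  !0 = distinct !{!0, !1}
  !1 = !{!"llvm.loop.vectorize.enable", i1 true}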
The -enable-strict-reductions flag introduced in D98435 currently only
vectorizes reductions in-loop when hints are used, since
canVectorizeFPMath() returns false if reordering is not allowed.
This patch changes canVectorizeFPMath() to query whether it is safe to
vectorize the loop with ordered reductions when no hints are used. For
testing purposes, an additional flag (-hints-allow-reordering) has been
added to disable the reordering behaviour described above.
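To illustrate the new behaviour, a minimal sketch (not part of this
commit) of the kind of loop that is now accepted: a scalar fadd
reduction with no fast-math flags and no loop hints, which
canVectorizeFPMath() permits under -enable-strict-reductions because
the reduction can be performed in-order inside the vector loop:

  define float @fadd_strict(float* noalias %a, i64 %n) {
  entry:
    br label %for.body

  for.body:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
    %sum = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
    %gep = getelementptr inbounds float, float* %a, i64 %iv
    %0 = load float, float* %gep, align 4
    ; No 'fast'/'reassoc' flags: the adds must stay in source order.
    %add = fadd float %sum, %0
    %iv.next = add nuw nsw i64 %iv, 1
    %cond = icmp eq i64 %iv.next, %n
    br i1 %cond, label %for.end, label %for.body

  for.end:
    ret float %add
  }

Run with, for example:

  opt -loop-vectorize -enable-strict-reductions=true -S < strict.ll

Before this patch the same invocation would reject the loop in
canVectorizeFPMath(), since without hints allowReordering() returns
false.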
Reviewed By: sdesmalen
Differential Revision: https://reviews.llvm.org/D101836
Added:
Modified:
llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index a66925ef671aa..4b22cdea5d807 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -159,16 +159,12 @@ class LoopVectorizeHints {
/// pass name to force the frontend to print the diagnostic.
const char *vectorizeAnalysisPassName() const;
- bool allowReordering() const {
- // When enabling loop hints are provided we allow the vectorizer to change
- // the order of operations that is given by the scalar loop. This is not
- // enabled by default because can be unsafe or inefficient. For example,
- // reordering floating-point operations will change the way round-off
- // error accumulates in the loop.
- ElementCount EC = getWidth();
- return getForce() == LoopVectorizeHints::FK_Enabled ||
- EC.getKnownMinValue() > 1;
- }
+ /// When enabling loop hints are provided we allow the vectorizer to change
+ /// the order of operations that is given by the scalar loop. This is not
+ /// enabled by default because it can be unsafe or inefficient. For example,
+ /// reordering floating-point operations will change the way round-off
+ /// error accumulates in the loop.
+ bool allowReordering() const;
bool isPotentiallyUnsafe() const {
// Avoid FP vectorization if the target is unsure about proper support.
@@ -219,9 +215,6 @@ class LoopVectorizationRequirements {
Instruction *getExactFPInst() { return ExactFPMathInst; }
- bool canVectorizeFPMath(const LoopVectorizeHints &Hints) const {
- return !ExactFPMathInst || Hints.allowReordering();
- }
unsigned getNumRuntimePointerChecks() const {
return NumRuntimePointerChecks;
@@ -279,6 +272,11 @@ class LoopVectorizationLegality {
/// If false, good old LV code.
bool canVectorize(bool UseVPlanNativePath);
+ /// Returns true if it is legal to vectorize the FP math operations in this
+ /// loop. Vectorizing is legal if we allow reordering of FP operations, or if
+ /// we can use in-order reductions.
+ bool canVectorizeFPMath(bool EnableStrictReductions);
+
/// Return true if we can vectorize this loop while folding its tail by
/// masking, and mark all respective loads/stores for masking.
/// This object's state is only modified iff this function returns true.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 59b2aa9c4a9c4..21f2378bddb93 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -37,6 +37,13 @@ static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
+namespace llvm {
+cl::opt<bool>
+ HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
+ cl::desc("Allow enabling loop hints to reorder "
+ "FP operations during vectorization."));
+}
+
// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
@@ -211,6 +218,15 @@ const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
return OptimizationRemarkAnalysis::AlwaysPrint;
}
+bool LoopVectorizeHints::allowReordering() const {
+ // Allow the vectorizer to change the order of operations if enabling
+ // loop hints are provided.
+ ElementCount EC = getWidth();
+ return HintsAllowReordering &&
+ (getForce() == LoopVectorizeHints::FK_Enabled ||
+ EC.getKnownMinValue() > 1);
+}
+
void LoopVectorizeHints::getHintsFromMetadata() {
MDNode *LoopID = TheLoop->getLoopID();
if (!LoopID)
@@ -884,6 +900,32 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
return true;
}
+bool LoopVectorizationLegality::canVectorizeFPMath(
+ bool EnableStrictReductions) {
+
+ // First check if there is any ExactFP math or if we allow reassociations
+ if (!Requirements->getExactFPInst() || Hints->allowReordering())
+ return true;
+
+ // If the above is false, we have ExactFPMath & do not allow reordering.
+ // If the EnableStrictReductions flag is set, first check if we have any
+ // Exact FP induction vars, which we cannot vectorize.
+ if (!EnableStrictReductions ||
+ any_of(getInductionVars(), [&](auto &Induction) -> bool {
+ InductionDescriptor IndDesc = Induction.second;
+ return IndDesc.getExactFPMathInst();
+ }))
+ return false;
+
+ // We can now only vectorize if all reductions with Exact FP math also
+ // have the isOrdered flag set, which indicates that we can move the
+ // reduction operations in-loop.
+ return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
+ RecurrenceDescriptor RdxDesc = Reduction.second;
+ return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
+ }));
+}
+
bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
Value *In0 = const_cast<Value *>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fbdcf117b804b..49624cc1a1ba4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -331,6 +331,10 @@ static cl::opt<bool>
cl::desc("Prefer in-loop vector reductions, "
"overriding the targets preference."));
+// FIXME: When loop hints are passed which allow reordering of FP operations,
+// we still choose to use strict reductions with this flag. We should instead
+// use the default behaviour of vectorizing with unordered reductions if
+// reordering is allowed.
cl::opt<bool> EnableStrictReductions(
"enable-strict-reductions", cl::init(false), cl::Hidden,
cl::desc("Enable the vectorisation of loops with in-order (strict) "
@@ -9956,7 +9960,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- if (!Requirements.canVectorizeFPMath(Hints)) {
+ if (!LVL.canVectorizeFPMath(EnableStrictReductions)) {
ORE->emit([&]() {
auto *ExactFPMathInst = Requirements.getExactFPInst();
return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index ec139e0eeca9d..eafa093a99743 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1,5 +1,6 @@
-; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
-; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=true -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
+; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
+; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
+; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_strict
@@ -26,6 +27,9 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RES]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
br label %for.body
@@ -86,6 +90,10 @@ define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
br label %for.body
@@ -166,6 +174,9 @@ define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float*
; CHECK-UNORDERED: store float %[[RDX2]], float* {{.*}}
; CHECK-UNORDERED: ret void
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
%arrayidxa = getelementptr inbounds float, float* %a, i64 1
%a1 = load float, float* %a, align 4
@@ -228,6 +239,9 @@ define float @fadd_of_sum(float* noalias nocapture readonly %a, float* noalias n
; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT]], %for.end.loopexit ]
; CHECK-UNORDERED: ret float %[[SUM]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
%arrayidx = getelementptr inbounds float, float* %a, i64 1
%0 = load float, float* %arrayidx, align 4
@@ -295,6 +309,9 @@ define float @fadd_conditional(float* noalias nocapture readonly %a, float* noal
; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RDX_PHI]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
br label %for.body
@@ -324,27 +341,9 @@ for.end:
}
; Negative test - loop contains multiple fadds which we cannot safely reorder
-; Note: This test vectorizes the loop with a non-strict implementation, which reorders the FAdd operations.
-; This is happening because we are using hints, where allowReordering returns true.
define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_multiple
-; CHECK-ORDERED: vector.body
-; CHECK-ORDERED: %[[PHI:.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> undef, float -0.000000e+00, i32 0), <vscale x 8 x float> undef, <vscale x 8 x i32> zeroinitializer), float -0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
-; CHECK-ORDERED: %[[VEC_LOAD1:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>
-; CHECK-ORDERED: %[[VEC_FADD1:.*]] = fadd <vscale x 8 x float> %[[PHI]], %[[VEC_LOAD1]]
-; CHECK-ORDERED: %[[VEC_LOAD2:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>
-; CHECK-ORDERED: %[[VEC_FADD2]] = fadd <vscale x 8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
-; CHECK-ORDERED: middle.block
-; CHECK-ORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[VEC_FADD2]])
-; CHECK-ORDERED: for.body
-; CHECK-ORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
-; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float*
-; CHECK-ORDERED: %[[FADD1:.*]] = fadd float %[[SUM]], %[[LOAD1]]
-; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
-; CHECK-ORDERED: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
-; CHECK-ORDERED: for.end
-; CHECK-ORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
-; CHECK-ORDERED: ret float %[[RET]]
+; CHECK-ORDERED-NOT: vector.body
; CHECK-UNORDERED-LABEL: @fadd_multiple
; CHECK-UNORDERED: vector.body
@@ -364,6 +363,10 @@ define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocaptur
; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RET]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
index bf9cf761981a6..6e82cefa4d47b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -1,5 +1,6 @@
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
-; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_strict
@@ -26,6 +27,9 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RES]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
br label %for.body
@@ -87,6 +91,9 @@ define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RES]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
br label %for.body
@@ -168,6 +175,9 @@ define float @fadd_strict_unroll_last_val(float* noalias nocapture readonly %a,
; CHECK-UNORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ]
; CHECK-UNORDERED: ret float %[[SUM_LCSSA]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll_last_val
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
%cmp = icmp sgt i64 %n, 0
br i1 %cmp, label %for.body, label %for.end
@@ -241,6 +251,9 @@ define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float*
; CHECK-UNORDERED: store float %[[SUM2]]
; CHECK-UNORDERED: ret void
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
%arrayidxa = getelementptr inbounds float, float* %a, i64 1
%a1 = load float, float* %a, align 4
@@ -303,6 +316,9 @@ define float @fadd_of_sum(float* noalias nocapture readonly %a, float* noalias n
; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT]], %for.end.loopexit ]
; CHECK-UNORDERED: ret float %[[SUM]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
%arrayidx = getelementptr inbounds float, float* %a, i64 1
%0 = load float, float* %arrayidx, align 4
@@ -383,6 +399,9 @@ define float @fadd_conditional(float* noalias nocapture readonly %a, float* noal
; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RDX_PHI]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
br label %for.body
@@ -449,6 +468,9 @@ define float @fadd_predicated(float* noalias nocapture %a, i64 %n) {
; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[SUM]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_predicated
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
br label %for.body
@@ -468,27 +490,9 @@ for.end: ; preds = %for.body
}
; Negative test - loop contains multiple fadds which we cannot safely reorder
-; Note: This test vectorizes the loop with a non-strict implementation, which reorders the FAdd operations.
-; This is happening because we are using hints, where allowReordering returns true.
define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_multiple
-; CHECK-ORDERED: vector.body
-; CHECK-ORDERED: %[[PHI:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
-; CHECK-ORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>
-; CHECK-ORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
-; CHECK-ORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>
-; CHECK-ORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
-; CHECK-ORDERED: middle.block
-; CHECK-ORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
-; CHECK-ORDERED: for.body
-; CHECK-ORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
-; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float*
-; CHECK-ORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
-; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
-; CHECK-ORDERED: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
-; CHECK-ORDERED: for.end
-; CHECK-ORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
-; CHECK-ORDERED: ret float %[[RET]]
+; CHECK-ORDERED-NOT: vector.body
; CHECK-UNORDERED-LABEL: @fadd_multiple
; CHECK-UNORDERED: vector.body
@@ -509,6 +513,9 @@ define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocaptur
; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float %[[RET]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
entry:
br label %for.body
@@ -530,6 +537,143 @@ for.end: ; preds = %for.body
ret float %rdx
}
+; Tests with both a floating point reduction & induction, e.g.
+;
+;float fp_iv_rdx_loop(float *values, float init, float * __restrict__ A, int N) {
+; float fp_inc = 2.0;
+; float x = init;
+; float sum = 0.0;
+; for (int i=0; i < N; ++i) {
+; A[i] = x;
+; x += fp_inc;
+; sum += values[i];
+; }
+; return sum;
+;}
+;
+; Note: These tests do not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
+; with the -hints-allow-reordering flag set to true.
+
+; Strict reduction could be performed in-loop, but ordered FP induction variables are not supported
+define float @induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
+; CHECK-ORDERED-LABEL: @induction_and_reduction
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @induction_and_reduction
+; CHECK-UNORDERED-NOT: vector.body
+
+; CHECK-NOT-VECTORIZED-LABEL: @induction_and_reduction
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.015 = phi float [ 0.000000e+00, %entry ], [ %add3, %for.body ]
+ %x.014 = phi float [ %init, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %iv
+ store float %x.014, float* %arrayidx, align 4
+ %add = fadd float %x.014, 2.000000e+00
+ %arrayidx2 = getelementptr inbounds float, float* %values, i64 %iv
+ %0 = load float, float* %arrayidx2, align 4
+ %add3 = fadd float %sum.015, %0
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+ ret float %add3
+}
+
+; As above, but with the FP induction being unordered (fast) the loop can be vectorized with strict reductions
+define float @fast_induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
+; CHECK-ORDERED-LABEL: @fast_induction_and_reduction
+; CHECK-ORDERED: vector.ph
+; CHECK-ORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
+; CHECK-ORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
+; CHECK-ORDERED: %[[STEP_ADD:.*]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]])
+; CHECK-ORDERED: %[[FADD2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[FADD1]], <4 x float> %[[LOAD2]])
+; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[STEP_ADD]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD3:.*]], %for.body ]
+; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
+; CHECK-ORDERED: store float %[[IND_SUM_PHI]], float*
+; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
+; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD3]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD3]]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD3]], %for.body ], [ %[[FADD2]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RES_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction
+; CHECK-UNORDERED-NOT: vector.body
+
+; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.015 = phi float [ 0.000000e+00, %entry ], [ %add3, %for.body ]
+ %x.014 = phi fast float [ %init, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %iv
+ store float %x.014, float* %arrayidx, align 4
+ %add = fadd fast float %x.014, 2.000000e+00
+ %arrayidx2 = getelementptr inbounds float, float* %values, i64 %iv
+ %0 = load float, float* %arrayidx2, align 4
+ %add3 = fadd float %sum.015, %0
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+ ret float %add3
+}
+
+; The FP induction is fast, but here we can't vectorize as only one of the reductions is an FAdd that can be performed in-loop
+define float @fast_induction_unordered_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, float* noalias nocapture %B, i64 %N) {
+
+; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fast_induction_unordered_reduction
+; CHECK-UNORDERED-NOT: vector.body
+
+; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_unordered_reduction
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum2.023 = phi float [ 3.000000e+00, %entry ], [ %mul, %for.body ]
+ %sum.022 = phi float [ 0.000000e+00, %entry ], [ %add3, %for.body ]
+ %x.021 = phi float [ %init, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %iv
+ store float %x.021, float* %arrayidx, align 4
+ %add = fadd fast float %x.021, 2.000000e+00
+ %arrayidx2 = getelementptr inbounds float, float* %values, i64 %iv
+ %0 = load float, float* %arrayidx2, align 4
+ %add3 = fadd float %sum.022, %0
+ %mul = fmul float %sum2.023, %0
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+ %add6 = fadd float %add3, %mul
+ ret float %add6
+}
+
!0 = distinct !{!0, !4, !7, !9}
!1 = distinct !{!1, !4, !8, !9}
!2 = distinct !{!2, !5, !7, !9}