[llvm-branch-commits] [llvm] 5b77ac3 - [SLP] match maxnum/minnum intrinsics as FP reduction ops

Sanjay Patel via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Jan 18 14:41:30 PST 2021


Author: Sanjay Patel
Date: 2021-01-18T17:37:16-05:00
New Revision: 5b77ac32b1150d066b35b45d6d982f4b4a1f62ff

URL: https://github.com/llvm/llvm-project/commit/5b77ac32b1150d066b35b45d6d982f4b4a1f62ff
DIFF: https://github.com/llvm/llvm-project/commit/5b77ac32b1150d066b35b45d6d982f4b4a1f62ff.diff

LOG: [SLP] match maxnum/minnum intrinsics as FP reduction ops

After much refactoring of the reduction matching code over the
last 2 weeks, I think this change is finally ready.

We effectively broke fmax/fmin vector reduction optimization
when we started canonicalizing to intrinsics in instcombine,
so this should restore that functionality for SLP.
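
For illustration, a minimal sketch of the pattern involved, adapted
from the reduction_v4f32_fast tests in this patch (the value names
%t0..%t3 and %vp are illustrative, not copied verbatim from the
tests):

  ; scalar chain as canonicalized by instcombine
  %m1 = call fast float @llvm.maxnum.f32(float %t1, float %t0)
  %m2 = call fast float @llvm.maxnum.f32(float %t2, float %m1)
  %m3 = call fast float @llvm.maxnum.f32(float %t3, float %m2)

  ; what SLP now forms for the 4 x float case
  %v = load <4 x float>, <4 x float>* %vp, align 4
  %r = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v)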

There are still FMF problems here, as noted in the code comments,
but we should avoid miscompiles for fmax/fmin by restricting the
transform to full 'fast' ops (negative tests are included).
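
As a sketch of that restriction (again with illustrative value
names): a chain whose calls carry only partial flags, e.g. just
'nnan' as the reduction_v4f32_nnan test suggests, is currently left
in scalar form, even though it should eventually be reducible too:

  ; not reduced yet: the calls are not marked fully 'fast'
  %m1 = call nnan float @llvm.maxnum.f32(float %t1, float %t0)
  %m2 = call nnan float @llvm.maxnum.f32(float %t2, float %m1)
  %m3 = call nnan float @llvm.maxnum.f32(float %t3, float %m2)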

Fixing FMF propagation is a planned follow-up.

Differential Revision: https://reviews.llvm.org/D94913

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
    llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
    llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0323e02d0d2c..0fee52dcdd93 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6455,6 +6455,10 @@ class HorizontalReduction {
       case RecurKind::FMul:
         return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                    Name);
+      case RecurKind::FMax:
+        return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
+      case RecurKind::FMin:
+        return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
 
       case RecurKind::SMax: {
         Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
@@ -6568,6 +6572,15 @@ class HorizontalReduction {
       if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
         return true;
 
+      if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
+        // FP min/max are associative except for NaN and -0.0. We do not
+        // have to rule out -0.0 here because the intrinsic semantics do not
+        // specify a fixed result for it.
+        // TODO: This is artificially restricted to fast because the code that
+        //       creates reductions assumes/produces fast ops.
+        return I->getFastMathFlags().isFast();
+      }
+
       return I->isAssociative();
     }
 
@@ -6677,6 +6690,11 @@ class HorizontalReduction {
     if (match(I, m_FMul(m_Value(), m_Value())))
       return OperationData(RecurKind::FMul);
 
+    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
+      return OperationData(RecurKind::FMax);
+    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
+      return OperationData(RecurKind::FMin);
+
     if (match(I, m_SMax(m_Value(), m_Value())))
       return OperationData(RecurKind::SMax);
     if (match(I, m_SMin(m_Value(), m_Value())))
@@ -7076,6 +7094,18 @@ class HorizontalReduction {
       ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
       break;
     }
+    case RecurKind::FMax:
+    case RecurKind::FMin: {
+      auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+      VectorCost =
+          TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+                                      /*pairwise=*/false, /*unsigned=*/false);
+      ScalarCost =
+          TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
+          TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+                                  CmpInst::makeCmpResultType(ScalarTy));
+      break;
+    }
     case RecurKind::SMax:
     case RecurKind::SMin:
     case RecurKind::UMax:
@@ -7307,6 +7337,16 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
   return nullptr;
 }
 
+static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
+  if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
+    return true;
+  return false;
+}
+
 /// Attempt to reduce a horizontal reduction.
 /// If it is legal to match a horizontal reduction feeding the phi node \a P
 /// with reduction operators \a Root (or one of its operands) in a basic block
@@ -7347,7 +7387,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
     unsigned Level;
     std::tie(Inst, Level) = Stack.pop_back_val();
     Value *B0, *B1;
-    bool IsBinop = match(Inst, m_BinOp(m_Value(B0), m_Value(B1)));
+    bool IsBinop = matchRdxBop(Inst, B0, B1);
     bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
     if (IsBinop || IsSelect) {
       HorizontalReduction HorRdx;

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
index 2c3efaa91567..a12bd31920c1 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
@@ -330,17 +330,16 @@ for.end:
 define float @fmin_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fmin_v4i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP1]], float [[TMP0]])
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
-; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP5]], float [[TMP4]])
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP1]], <4 x float> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP4:%.*]] = fcmp fast olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT5:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP4]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT5]], i32 0
+; CHECK-NEXT:    ret float [[TMP2]]
 ;
 entry:
   br label %for.cond

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
index e2754862399e..fc134aa6deef 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
@@ -343,14 +343,10 @@ define float @reduction_v4f32_fast(float* %p) {
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    ret float [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -365,6 +361,8 @@ define float @reduction_v4f32_fast(float* %p) {
   ret float %m3
 }
 
+; TODO: This should become a reduce intrinsic.
+
 define float @reduction_v4f32_nnan(float* %p) {
 ; CHECK-LABEL: @reduction_v4f32_nnan(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -392,6 +390,8 @@ define float @reduction_v4f32_nnan(float* %p) {
   ret float %m3
 }
 
+; Negative test - must have nnan.
+
 define float @reduction_v4f32_not_fast(float* %p) {
 ; CHECK-LABEL: @reduction_v4f32_not_fast(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -428,22 +428,10 @@ define float @reduction_v8f32_fast(float* %p) {
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds float, float* [[P]], i64 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds float, float* [[P]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds float, float* [[P]], i64 7
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[G4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load float, float* [[G5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load float, float* [[G6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load float, float* [[G7]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    [[M4:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T4]], float [[M3]])
-; CHECK-NEXT:    [[M5:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M4]], float [[T6]])
-; CHECK-NEXT:    [[M6:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M5]], float [[T5]])
-; CHECK-NEXT:    [[M7:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M6]], float [[T7]])
-; CHECK-NEXT:    ret float [[M7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -490,14 +478,10 @@ define double @reduction_v4f64_fast(double* %p) {
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load double, double* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load double, double* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load double, double* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load double, double* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT:    ret double [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P]] to <4 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP2]])
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
   %g1 = getelementptr inbounds double, double* %p, i64 1
   %g2 = getelementptr inbounds double, double* %p, i64 2
@@ -512,6 +496,8 @@ define double @reduction_v4f64_fast(double* %p) {
   ret double %m3
 }
 
+; Negative test - must have nnan.
+
 define double @reduction_v4f64_wrong_fmf(double* %p) {
 ; CHECK-LABEL: @reduction_v4f64_wrong_fmf(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
index 15a7848f8eca..e5a4fc235748 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
@@ -343,14 +343,10 @@ define float @reduction_v4f32_fast(float* %p) {
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    ret float [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -365,6 +361,8 @@ define float @reduction_v4f32_fast(float* %p) {
   ret float %m3
 }
 
+; TODO: This should become a reduce intrinsic.
+
 define float @reduction_v4f32_nnan(float* %p) {
 ; CHECK-LABEL: @reduction_v4f32_nnan(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -392,6 +390,8 @@ define float @reduction_v4f32_nnan(float* %p) {
   ret float %m3
 }
 
+; Negative test - must have nnan.
+
 define float @reduction_v4f32_wrong_fmf(float* %p) {
 ; CHECK-LABEL: @reduction_v4f32_wrong_fmf(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -428,22 +428,10 @@ define float @reduction_v8f32_fast(float* %p) {
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds float, float* [[P]], i64 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds float, float* [[P]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds float, float* [[P]], i64 7
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[G4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load float, float* [[G5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load float, float* [[G6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load float, float* [[G7]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    [[M4:%.*]] = tail call fast float @llvm.minnum.f32(float [[T4]], float [[M3]])
-; CHECK-NEXT:    [[M5:%.*]] = tail call fast float @llvm.minnum.f32(float [[M4]], float [[T6]])
-; CHECK-NEXT:    [[M6:%.*]] = tail call fast float @llvm.minnum.f32(float [[M5]], float [[T5]])
-; CHECK-NEXT:    [[M7:%.*]] = tail call fast float @llvm.minnum.f32(float [[M6]], float [[T7]])
-; CHECK-NEXT:    ret float [[M7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -490,14 +478,10 @@ define double @reduction_v4f64_fast(double* %p) {
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load double, double* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load double, double* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load double, double* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load double, double* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast double @llvm.minnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast double @llvm.minnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast double @llvm.minnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT:    ret double [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P]] to <4 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP2]])
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
   %g1 = getelementptr inbounds double, double* %p, i64 1
   %g2 = getelementptr inbounds double, double* %p, i64 2
@@ -512,6 +496,8 @@ define double @reduction_v4f64_fast(double* %p) {
   ret double %m3
 }
 
+; Negative test - must have nnan.
+
 define double @reduction_v4f64_not_fast(double* %p) {
 ; CHECK-LABEL: @reduction_v4f64_not_fast(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1


        

