[llvm] 26bfbec - [Intrinsic] Introduce reduction intrinsics for minimum/maximum
Anna Thomas via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 13 09:30:10 PDT 2023
Author: Anna Thomas
Date: 2023-06-13T12:29:58-04:00
New Revision: 26bfbec5d2817f75b2cc3e680bc93e247e1d3263
URL: https://github.com/llvm/llvm-project/commit/26bfbec5d2817f75b2cc3e680bc93e247e1d3263
DIFF: https://github.com/llvm/llvm-project/commit/26bfbec5d2817f75b2cc3e680bc93e247e1d3263.diff
LOG: [Intrinsic] Introduce reduction intrinsics for minimum/maximum
This patch introduces reduction intrinsics for floating-point minimum
and maximum which have the same semantics (for NaN and signed zero) as
llvm.minimum and llvm.maximum.
Reviewed-By: nikic
Differential Revision: https://reviews.llvm.org/D152370
Added:
llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
Modified:
llvm/docs/LangRef.rst
llvm/include/llvm/CodeGen/ISDOpcodes.h
llvm/include/llvm/IR/IRBuilder.h
llvm/include/llvm/IR/Intrinsics.td
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
llvm/lib/CodeGen/TargetLoweringBase.cpp
llvm/lib/IR/IRBuilder.cpp
Removed:
################################################################################
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index dd18e960673ad..b9d3589882278 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -17846,6 +17846,64 @@ Arguments:
""""""""""
The argument to this intrinsic must be a vector of floating-point values.
+.. _int_vector_reduce_fmaximum:
+
+'``llvm.vector.reduce.fmaximum.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
+ declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.vector.reduce.fmaximum.*``' intrinsics do a floating-point
+``MAX`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+This intrinsic has the same comparison semantics as the '``llvm.maximum.*``'
+intrinsic. That is, it propagates NaNs and +0.0 is considered greater
+than -0.0. If any element of the vector is a NaN, the result is NaN.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating-point values.
+
+.. _int_vector_reduce_fminimum:
+
+'``llvm.vector.reduce.fminimum.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
+ declare double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.vector.reduce.fminimum.*``' intrinsics do a floating-point
+``MIN`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+This intrinsic has the same comparison semantics as the '``llvm.minimum.*``'
+intrinsic. That is, it propagates NaNs and -0.0 is considered less
+than +0.0. If any element of the vector is a NaN, the result is NaN.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating-point values.
+
'``llvm.vector.insert``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
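
For readers skimming the diff, the comparison semantics the two new
reductions inherit from llvm.maximum/llvm.minimum can be modeled in plain
C++. This is a minimal sketch, not part of the patch; the helper names are
illustrative, and a non-empty vector is assumed:

  #include <cmath>
  #include <cstddef>
  #include <limits>

  // Model of llvm.maximum: NaN-propagating, and +0.0 is considered
  // greater than -0.0.
  static float maximumSemantics(float A, float B) {
    if (std::isnan(A) || std::isnan(B))
      return std::numeric_limits<float>::quiet_NaN(); // propagate NaN
    if (A == B) // +0.0 == -0.0 compares equal; prefer the positive zero
      return std::signbit(A) ? B : A;
    return A > B ? A : B;
  }

  // Model of llvm.vector.reduce.fmaximum over N >= 1 elements.
  static float reduceFMaximum(const float *V, size_t N) {
    float Acc = V[0];
    for (size_t I = 1; I != N; ++I)
      Acc = maximumSemantics(Acc, V[I]);
    return Acc; // NaN if any element was NaN
  }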
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 0b1d1d75151cb..fbcd3f9d1f80d 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1318,6 +1318,10 @@ enum NodeType {
/// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
VECREDUCE_FMAX,
VECREDUCE_FMIN,
+ /// FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeros using the
+ /// llvm.minimum and llvm.maximum semantics.
+ VECREDUCE_FMAXIMUM,
+ VECREDUCE_FMINIMUM,
/// Integer reductions may have a result type larger than the vector element
/// type. However, the reduction is performed using the vector element type
/// and the value in the top bits is unspecified.
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 1e32908dd5373..e607982d21da5 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -756,6 +756,16 @@ class IRBuilderBase {
/// vector.
CallInst *CreateFPMinReduce(Value *Src);
+ /// Create a vector float maximum reduction intrinsic of the source
+ /// vector. This variant follows the NaN and signed zero semantics of
+ /// the llvm.maximum intrinsic.
+ CallInst *CreateFPMaximumReduce(Value *Src);
+
+ /// Create a vector float minimum reduction intrinsic of the source
+ /// vector. This variant follows the NaN and signed zero semantics of
+ /// the llvm.minimum intrinsic.
+ CallInst *CreateFPMinimumReduce(Value *Src);
+
/// Create a lifetime.start intrinsic.
///
/// If the pointer isn't i8* it will be converted.
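
With the declarations above, emitting one of the new reductions is a single
builder call. A minimal usage sketch (the wrapper function is hypothetical;
only CreateFPMaximumReduce/CreateFPMinimumReduce come from this patch):

  #include "llvm/IR/IRBuilder.h"

  // Emit @llvm.vector.reduce.fmaximum over Vec, which must be a vector
  // of floating-point values; the result is a scalar of the element type.
  static llvm::CallInst *emitFMaximumReduce(llvm::IRBuilderBase &B,
                                            llvm::Value *Vec) {
    return B.CreateFPMaximumReduce(Vec);
  }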
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index ebbd2af8efc7e..97bfb8f22fda5 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2323,6 +2323,10 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in {
[llvm_anyvector_ty]>;
def int_vector_reduce_fmin : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
[llvm_anyvector_ty]>;
+ def int_vector_reduce_fminimum: DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [llvm_anyvector_ty]>;
+ def int_vector_reduce_fmaximum: DefaultAttrsIntrinsic<[LLVMVectorElementType<0>],
+ [llvm_anyvector_ty]>;
}
//===----- Matrix intrinsics ---------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b20cca050b67b..b96d7c34161cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2015,7 +2015,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::VECREDUCE_UMAX:
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
- case ISD::VECREDUCE_FMIN: return visitVECREDUCE(N);
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM: return visitVECREDUCE(N);
#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) case ISD::SDOPC:
#include "llvm/IR/VPIntrinsics.def"
return visitVPOp(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index b21a89bedc6e2..d4daaf6f7af00 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1205,6 +1205,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
case ISD::IS_FPCLASS:
Action = TLI.getOperationAction(
Node->getOpcode(), Node->getOperand(0).getValueType());
@@ -4002,6 +4004,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
Results.push_back(TLI.expandVecReduce(Node, DAG));
break;
case ISD::GLOBAL_OFFSET_TABLE:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 366a8d58d9c87..0cf37f1f82c57 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -145,6 +145,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
R = SoftenFloatRes_VECREDUCE(N);
break;
case ISD::VECREDUCE_SEQ_FADD:
@@ -2339,6 +2341,8 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
R = PromoteFloatRes_VECREDUCE(N);
break;
case ISD::VECREDUCE_SEQ_FADD:
@@ -2704,6 +2708,8 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
R = SoftPromoteHalfRes_VECREDUCE(N);
break;
case ISD::VECREDUCE_SEQ_FADD:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 5980383d6627f..730a420e6a003 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -451,6 +451,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
Action = TLI.getOperationAction(Node->getOpcode(),
Node->getOperand(0).getValueType());
break;
@@ -960,6 +962,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
Results.push_back(TLI.expandVecReduce(Node, DAG));
return;
case ISD::VECREDUCE_SEQ_FADD:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index a1a150d5234b1..1892f4cffb4b2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -696,6 +696,8 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
Res = ScalarizeVecOp_VECREDUCE(N);
break;
case ISD::VECREDUCE_SEQ_FADD:
@@ -2924,6 +2926,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
Res = SplitVecOp_VECREDUCE(N, OpNo);
break;
case ISD::VECREDUCE_SEQ_FADD:
@@ -5921,6 +5925,8 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VECREDUCE_UMIN:
case ISD::VECREDUCE_FMAX:
case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
Res = WidenVecOp_VECREDUCE(N);
break;
case ISD::VECREDUCE_SEQ_FADD:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d83aaf31950d5..c9d15fc0e40c7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -455,6 +455,10 @@ ISD::NodeType ISD::getVecReduceBaseOpcode(unsigned VecReduceOpcode) {
case ISD::VECREDUCE_FMIN:
case ISD::VP_REDUCE_FMIN:
return ISD::FMINNUM;
+ case ISD::VECREDUCE_FMAXIMUM:
+ return ISD::FMAXIMUM;
+ case ISD::VECREDUCE_FMINIMUM:
+ return ISD::FMINIMUM;
}
}
@@ -12393,6 +12397,18 @@ SDValue SelectionDAG::getNeutralElement(unsigned Opcode, const SDLoc &DL,
return getConstantFP(NeutralAF, DL, VT);
}
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM: {
+ // Neutral element for fminimum is +Inf or FLT_MAX, depending on FMF.
+ const fltSemantics &Semantics = EVTToAPFloatSemantics(VT);
+ APFloat NeutralAF = !Flags.hasNoInfs() ? APFloat::getInf(Semantics)
+ : APFloat::getLargest(Semantics);
+ if (Opcode == ISD::FMAXIMUM)
+ NeutralAF.changeSign();
+
+ return getConstantFP(NeutralAF, DL, VT);
+ }
+
}
}
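
The getNeutralElement change above is what lets legalization pad a widened
vector without changing the result: the padding lane must never win the
reduction. A hedged C++ sketch of the rule (the helper name is illustrative;
the constants match the AArch64 tests below, which pad with -Inf, or with
-FLT_MAX under the ninf flag):

  #include <cfloat>
  #include <cmath>

  // Padding value for a widened FMAXIMUM reduction: it must never win
  // the reduction. Normally that is -Inf; when the ninf flag rules out
  // infinities, the most negative finite value suffices. For FMINIMUM
  // the sign flips (+Inf, or FLT_MAX under ninf).
  static float fmaximumNeutral(bool NoInfs) {
    return NoInfs ? -FLT_MAX : -INFINITY;
  }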
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 267bdc02cdf05..8afe9f5315bcf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/CodeGenCommonISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -7289,6 +7290,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::vector_reduce_umin:
case Intrinsic::vector_reduce_fmax:
case Intrinsic::vector_reduce_fmin:
+ case Intrinsic::vector_reduce_fmaximum:
+ case Intrinsic::vector_reduce_fminimum:
visitVectorReduce(I, Intrinsic);
return;
@@ -10010,6 +10013,12 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
case Intrinsic::vector_reduce_fmin:
Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
break;
+ case Intrinsic::vector_reduce_fmaximum:
+ Res = DAG.getNode(ISD::VECREDUCE_FMAXIMUM, dl, VT, Op1, SDFlags);
+ break;
+ case Intrinsic::vector_reduce_fminimum:
+ Res = DAG.getNode(ISD::VECREDUCE_FMINIMUM, dl, VT, Op1, SDFlags);
+ break;
default:
llvm_unreachable("Unhandled vector reduce intrinsic");
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 0767d8b141ec0..a37d5235c4648 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -500,6 +500,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::VECREDUCE_UMIN: return "vecreduce_umin";
case ISD::VECREDUCE_FMAX: return "vecreduce_fmax";
case ISD::VECREDUCE_FMIN: return "vecreduce_fmin";
+ case ISD::VECREDUCE_FMAXIMUM: return "vecreduce_fmaximum";
+ case ISD::VECREDUCE_FMINIMUM: return "vecreduce_fminimum";
case ISD::STACKMAP:
return "stackmap";
case ISD::PATCHPOINT:
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c32f86167afa5..1c54122e7448c 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -882,7 +882,8 @@ void TargetLoweringBase::initActions() {
ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR,
ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN,
ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_FMAX,
- ISD::VECREDUCE_FMIN, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_SEQ_FMUL},
+ ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM,
+ ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_SEQ_FMUL},
VT, Expand);
// Named vector shuffles default to expand.
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index fd5a00f7d4786..21f4e7ba64a87 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -482,6 +482,14 @@ CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src) {
return getReductionIntrinsic(Intrinsic::vector_reduce_fmin, Src);
}
+CallInst *IRBuilderBase::CreateFPMaximumReduce(Value *Src) {
+ return getReductionIntrinsic(Intrinsic::vector_reduce_fmaximum, Src);
+}
+
+CallInst *IRBuilderBase::CreateFPMinimumReduce(Value *Src) {
+ return getReductionIntrinsic(Intrinsic::vector_reduce_fminimum, Src);
+}
+
CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.start only applies to pointers.");
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
new file mode 100644
index 0000000000000..82454564df68d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
@@ -0,0 +1,224 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon,+fullfp16 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP
+
+declare half @llvm.vector.reduce.fmaximum.v1f16(<1 x half> %a)
+declare float @llvm.vector.reduce.fmaximum.v1f32(<1 x float> %a)
+declare double @llvm.vector.reduce.fmaximum.v1f64(<1 x double> %a)
+declare fp128 @llvm.vector.reduce.fmaximum.v1f128(<1 x fp128> %a)
+
+declare half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a)
+declare half @llvm.vector.reduce.fmaximum.v11f16(<11 x half> %a)
+declare float @llvm.vector.reduce.fmaximum.v3f32(<3 x float> %a)
+declare fp128 @llvm.vector.reduce.fmaximum.v2f128(<2 x fp128> %a)
+declare float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> %a)
+
+define half @test_v1f16(<1 x half> %a) nounwind {
+; CHECK-LABEL: test_v1f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %b = call half @llvm.vector.reduce.fmaximum.v1f16(<1 x half> %a)
+ ret half %b
+}
+
+define float @test_v1f32(<1 x float> %a) nounwind {
+; CHECK-LABEL: test_v1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
+ %b = call float @llvm.vector.reduce.fmaximum.v1f32(<1 x float> %a)
+ ret float %b
+}
+
+define double @test_v1f64(<1 x double> %a) nounwind {
+; CHECK-LABEL: test_v1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %b = call double @llvm.vector.reduce.fmaximum.v1f64(<1 x double> %a)
+ ret double %b
+}
+
+define fp128 @test_v1f128(<1 x fp128> %a) nounwind {
+; CHECK-LABEL: test_v1f128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %b = call fp128 @llvm.vector.reduce.fmaximum.v1f128(<1 x fp128> %a)
+ ret fp128 %b
+}
+
+define half @test_v4f16(<4 x half> %a) nounwind {
+; CHECK-NOFP-LABEL: test_v4f16:
+; CHECK-NOFP: // %bb.0:
+; CHECK-NOFP-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-NEXT: mov h1, v0.h[1]
+; CHECK-NOFP-NEXT: fcvt s2, h0
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmax s1, s2, s1
+; CHECK-NOFP-NEXT: mov h2, v0.h[2]
+; CHECK-NOFP-NEXT: mov h0, v0.h[3]
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s2, h2
+; CHECK-NOFP-NEXT: fcvt s0, h0
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmax s1, s1, s2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmax s0, s1, s0
+; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: ret
+;
+; CHECK-FP-LABEL: test_v4f16:
+; CHECK-FP: // %bb.0:
+; CHECK-FP-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-NEXT: mov h1, v0.h[1]
+; CHECK-FP-NEXT: mov h2, v0.h[2]
+; CHECK-FP-NEXT: fmax h1, h0, h1
+; CHECK-FP-NEXT: mov h0, v0.h[3]
+; CHECK-FP-NEXT: fmax h1, h1, h2
+; CHECK-FP-NEXT: fmax h0, h1, h0
+; CHECK-FP-NEXT: ret
+ %b = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a)
+ ret half %b
+}
+
+define half @test_v11f16(<11 x half> %a) nounwind {
+; CHECK-NOFP-LABEL: test_v11f16:
+; CHECK-NOFP: // %bb.0:
+; CHECK-NOFP-NEXT: ldr h16, [sp, #8]
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: ldr h17, [sp]
+; CHECK-NOFP-NEXT: fcvt s0, h0
+; CHECK-NOFP-NEXT: fcvt s2, h2
+; CHECK-NOFP-NEXT: fcvt s16, h16
+; CHECK-NOFP-NEXT: fcvt s17, h17
+; CHECK-NOFP-NEXT: fmax s1, s1, s16
+; CHECK-NOFP-NEXT: ldr h16, [sp, #16]
+; CHECK-NOFP-NEXT: fmax s0, s0, s17
+; CHECK-NOFP-NEXT: fcvt s16, h16
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fcvt s0, h0
+; CHECK-NOFP-NEXT: fmax s0, s0, s1
+; CHECK-NOFP-NEXT: fmax s1, s2, s16
+; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s0, h0
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmax s0, s0, s1
+; CHECK-NOFP-NEXT: fcvt s1, h3
+; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt s0, h0
+; CHECK-NOFP-NEXT: fmax s0, s0, s1
+; CHECK-NOFP-NEXT: fcvt s1, h4
+; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt s0, h0
+; CHECK-NOFP-NEXT: fmax s0, s0, s1
+; CHECK-NOFP-NEXT: fcvt s1, h5
+; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt s0, h0
+; CHECK-NOFP-NEXT: fmax s0, s0, s1
+; CHECK-NOFP-NEXT: fcvt s1, h6
+; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt s0, h0
+; CHECK-NOFP-NEXT: fmax s0, s0, s1
+; CHECK-NOFP-NEXT: fcvt s1, h7
+; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt s0, h0
+; CHECK-NOFP-NEXT: fmax s0, s0, s1
+; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: ret
+;
+; CHECK-FP-LABEL: test_v11f16:
+; CHECK-FP: // %bb.0:
+; CHECK-FP-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-FP-NEXT: // kill: def $h1 killed $h1 def $q1
+; CHECK-FP-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-FP-NEXT: // kill: def $h3 killed $h3 def $q3
+; CHECK-FP-NEXT: // kill: def $h4 killed $h4 def $q4
+; CHECK-FP-NEXT: mov x8, sp
+; CHECK-FP-NEXT: // kill: def $h5 killed $h5 def $q5
+; CHECK-FP-NEXT: // kill: def $h6 killed $h6 def $q6
+; CHECK-FP-NEXT: // kill: def $h7 killed $h7 def $q7
+; CHECK-FP-NEXT: mov v0.h[1], v1.h[0]
+; CHECK-FP-NEXT: movi v1.8h, #252, lsl #8
+; CHECK-FP-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[0], [x8]
+; CHECK-FP-NEXT: add x8, sp, #8
+; CHECK-FP-NEXT: mov v0.h[3], v3.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[1], [x8]
+; CHECK-FP-NEXT: add x8, sp, #16
+; CHECK-FP-NEXT: mov v0.h[4], v4.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[2], [x8]
+; CHECK-FP-NEXT: mov v0.h[5], v5.h[0]
+; CHECK-FP-NEXT: mov v0.h[6], v6.h[0]
+; CHECK-FP-NEXT: mov v0.h[7], v7.h[0]
+; CHECK-FP-NEXT: fmax v0.8h, v0.8h, v1.8h
+; CHECK-FP-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-FP-NEXT: fmax v0.4h, v0.4h, v1.4h
+; CHECK-FP-NEXT: mov h1, v0.h[1]
+; CHECK-FP-NEXT: mov h2, v0.h[2]
+; CHECK-FP-NEXT: fmax h1, h0, h1
+; CHECK-FP-NEXT: mov h0, v0.h[3]
+; CHECK-FP-NEXT: fmax h1, h1, h2
+; CHECK-FP-NEXT: fmax h0, h1, h0
+; CHECK-FP-NEXT: ret
+ %b = call half @llvm.vector.reduce.fmaximum.v11f16(<11 x half> %a)
+ ret half %b
+}
+
+; The neutral element, negative infinity, is chosen for padding the widened
+; vector.
+define float @test_v3f32(<3 x float> %a) nounwind {
+; CHECK-LABEL: test_v3f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #-8388608 // =0xff800000
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: fmax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: fmax s0, s0, s1
+; CHECK-NEXT: ret
+ %b = call float @llvm.vector.reduce.fmaximum.v3f32(<3 x float> %a)
+ ret float %b
+}
+
+; The neutral element chosen for padding the widened vector is not negative infinity, because the call carries the ninf flag.
+define float @test_v3f32_ninf(<3 x float> %a) nounwind {
+; CHECK-LABEL: test_v3f32_ninf:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #-8388609 // =0xff7fffff
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: fmax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: fmax s0, s0, s1
+; CHECK-NEXT: ret
+ %b = call ninf float @llvm.vector.reduce.fmaximum.v3f32(<3 x float> %a)
+ ret float %b
+}
+
+; Cannot legalize f128. See PR63267 - The underlying fmaximum has no default
+; expansion and no libcalls.
+;define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
+; %b = call fp128 @llvm.vector.reduce.fmaximum.v2f128(<2 x fp128> %a)
+; ret fp128 %b
+;}
+
+define float @test_v16f32(<16 x float> %a) nounwind {
+; CHECK-LABEL: test_v16f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmax v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: fmax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: fmax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: fmax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: fmax s0, s0, s1
+; CHECK-NEXT: ret
+ %b = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> %a)
+ ret float %b
+}
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
new file mode 100644
index 0000000000000..301629f033dbc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
@@ -0,0 +1,1794 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VL
+
+;
+; vXf32
+;
+
+define float @test_v1f32(<1 x float> %a0) {
+; ALL-LABEL: test_v1f32:
+; ALL: # %bb.0:
+; ALL-NEXT: retq
+ %1 = call float @llvm.vector.reduce.fmaximum.v1f32(<1 x float> %a0)
+ ret float %1
+}
+
+define float @test_v2f32(<2 x float> %a0) {
+; SSE2-LABEL: test_v2f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: js .LBB1_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: .LBB1_2:
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: cmpunordss %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: andps %xmm3, %xmm4
+; SSE2-NEXT: js .LBB1_4
+; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: .LBB1_4:
+; SSE2-NEXT: maxss %xmm2, %xmm3
+; SSE2-NEXT: andnps %xmm3, %xmm1
+; SSE2-NEXT: orps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: movd %xmm0, %eax
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: js .LBB1_2
+; SSE41-NEXT: # %bb.1:
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: .LBB1_2:
+; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: cmpunordss %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm4
+; SSE41-NEXT: andps %xmm3, %xmm4
+; SSE41-NEXT: js .LBB1_4
+; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: movaps %xmm0, %xmm2
+; SSE41-NEXT: .LBB1_4:
+; SSE41-NEXT: maxss %xmm2, %xmm3
+; SSE41-NEXT: andnps %xmm3, %xmm1
+; SSE41-NEXT: orps %xmm4, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v2f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: js .LBB1_1
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: vmovaps %xmm0, %xmm2
+; AVX-NEXT: jmp .LBB1_3
+; AVX-NEXT: .LBB1_1:
+; AVX-NEXT: vmovaps %xmm1, %xmm2
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: .LBB1_3:
+; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: testl %eax, %eax
+; AVX512-NEXT: sets %al
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vmovaps %xmm0, %xmm2
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %1 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a0)
+ ret float %1
+}
+
+define float @test_v4f32(<4 x float> %a0) {
+; SSE2-LABEL: test_v4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: js .LBB2_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movaps %xmm3, %xmm4
+; SSE2-NEXT: .LBB2_2:
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movaps %xmm4, %xmm2
+; SSE2-NEXT: cmpunordss %xmm4, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm5
+; SSE2-NEXT: andps %xmm4, %xmm5
+; SSE2-NEXT: js .LBB2_4
+; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: .LBB2_4:
+; SSE2-NEXT: maxss %xmm3, %xmm4
+; SSE2-NEXT: andnps %xmm4, %xmm2
+; SSE2-NEXT: orps %xmm5, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps %xmm2, %xmm4
+; SSE2-NEXT: js .LBB2_6
+; SSE2-NEXT: # %bb.5:
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: .LBB2_6:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT: movaps %xmm4, %xmm3
+; SSE2-NEXT: cmpunordss %xmm4, %xmm3
+; SSE2-NEXT: movaps %xmm3, %xmm5
+; SSE2-NEXT: andps %xmm4, %xmm5
+; SSE2-NEXT: js .LBB2_8
+; SSE2-NEXT: # %bb.7:
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: .LBB2_8:
+; SSE2-NEXT: maxss %xmm1, %xmm4
+; SSE2-NEXT: andnps %xmm4, %xmm3
+; SSE2-NEXT: orps %xmm5, %xmm3
+; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: js .LBB2_10
+; SSE2-NEXT: # %bb.9:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: .LBB2_10:
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: cmpunordss %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: andps %xmm2, %xmm4
+; SSE2-NEXT: js .LBB2_12
+; SSE2-NEXT: # %bb.11:
+; SSE2-NEXT: movaps %xmm3, %xmm0
+; SSE2-NEXT: .LBB2_12:
+; SSE2-NEXT: maxss %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: movd %xmm0, %eax
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: js .LBB2_2
+; SSE41-NEXT: # %bb.1:
+; SSE41-NEXT: movaps %xmm3, %xmm4
+; SSE41-NEXT: .LBB2_2:
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: movaps %xmm4, %xmm2
+; SSE41-NEXT: cmpunordss %xmm4, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm5
+; SSE41-NEXT: andps %xmm4, %xmm5
+; SSE41-NEXT: js .LBB2_4
+; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: .LBB2_4:
+; SSE41-NEXT: maxss %xmm3, %xmm4
+; SSE41-NEXT: andnps %xmm4, %xmm2
+; SSE41-NEXT: orps %xmm5, %xmm2
+; SSE41-NEXT: movd %xmm2, %eax
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: movaps %xmm2, %xmm4
+; SSE41-NEXT: js .LBB2_6
+; SSE41-NEXT: # %bb.5:
+; SSE41-NEXT: movaps %xmm1, %xmm4
+; SSE41-NEXT: .LBB2_6:
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE41-NEXT: movaps %xmm4, %xmm3
+; SSE41-NEXT: cmpunordss %xmm4, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm5
+; SSE41-NEXT: andps %xmm4, %xmm5
+; SSE41-NEXT: js .LBB2_8
+; SSE41-NEXT: # %bb.7:
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: .LBB2_8:
+; SSE41-NEXT: maxss %xmm1, %xmm4
+; SSE41-NEXT: andnps %xmm4, %xmm3
+; SSE41-NEXT: orps %xmm5, %xmm3
+; SSE41-NEXT: movd %xmm3, %eax
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: movaps %xmm3, %xmm2
+; SSE41-NEXT: js .LBB2_10
+; SSE41-NEXT: # %bb.9:
+; SSE41-NEXT: movaps %xmm0, %xmm2
+; SSE41-NEXT: .LBB2_10:
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: cmpunordss %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm4
+; SSE41-NEXT: andps %xmm2, %xmm4
+; SSE41-NEXT: js .LBB2_12
+; SSE41-NEXT: # %bb.11:
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: .LBB2_12:
+; SSE41-NEXT: maxss %xmm0, %xmm2
+; SSE41-NEXT: andnps %xmm2, %xmm1
+; SSE41-NEXT: orps %xmm4, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: js .LBB2_1
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: vmovaps %xmm0, %xmm3
+; AVX-NEXT: jmp .LBB2_3
+; AVX-NEXT: .LBB2_1:
+; AVX-NEXT: vmovaps %xmm2, %xmm3
+; AVX-NEXT: vmovaps %xmm0, %xmm2
+; AVX-NEXT: .LBB2_3:
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vmaxss %xmm3, %xmm2, %xmm3
+; AVX-NEXT: vcmpunordss %xmm2, %xmm2, %xmm4
+; AVX-NEXT: vblendvps %xmm4, %xmm2, %xmm3, %xmm3
+; AVX-NEXT: vmovd %xmm3, %eax
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: js .LBB2_4
+; AVX-NEXT: # %bb.5:
+; AVX-NEXT: vmovaps %xmm3, %xmm2
+; AVX-NEXT: jmp .LBB2_6
+; AVX-NEXT: .LBB2_4:
+; AVX-NEXT: vmovapd %xmm1, %xmm2
+; AVX-NEXT: vmovaps %xmm3, %xmm1
+; AVX-NEXT: .LBB2_6:
+; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: js .LBB2_7
+; AVX-NEXT: # %bb.8:
+; AVX-NEXT: vmovaps %xmm1, %xmm2
+; AVX-NEXT: jmp .LBB2_9
+; AVX-NEXT: .LBB2_7:
+; AVX-NEXT: vmovaps %xmm0, %xmm2
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: .LBB2_9:
+; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: testl %eax, %eax
+; AVX512-NEXT: sets %al
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vmovaps %xmm0, %xmm4
+; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
+; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1}
+; AVX512-NEXT: vmaxss %xmm4, %xmm3, %xmm0
+; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: testl %eax, %eax
+; AVX512-NEXT: sets %al
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vmovaps %xmm0, %xmm3
+; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmaxss %xmm3, %xmm2, %xmm0
+; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1
+; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: testl %eax, %eax
+; AVX512-NEXT: sets %al
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vmovaps %xmm0, %xmm2
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %1 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a0)
+ ret float %1
+}
+
+define float @test_v8f32(<8 x float> %a0) {
+; SSE2-LABEL: test_v8f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: maxps %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: cmpunordps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: js .LBB3_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: .LBB3_2:
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: cmpunordss %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: andps %xmm3, %xmm4
+; SSE2-NEXT: js .LBB3_4
+; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: .LBB3_4:
+; SSE2-NEXT: maxss %xmm2, %xmm3
+; SSE2-NEXT: andnps %xmm3, %xmm1
+; SSE2-NEXT: orps %xmm4, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: js .LBB3_6
+; SSE2-NEXT: # %bb.5:
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: .LBB3_6:
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: cmpunordss %xmm4, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm5
+; SSE2-NEXT: andps %xmm4, %xmm5
+; SSE2-NEXT: js .LBB3_8
+; SSE2-NEXT: # %bb.7:
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: .LBB3_8:
+; SSE2-NEXT: maxss %xmm3, %xmm4
+; SSE2-NEXT: andnps %xmm4, %xmm2
+; SSE2-NEXT: orps %xmm5, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: js .LBB3_10
+; SSE2-NEXT: # %bb.9:
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: .LBB3_10:
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: cmpunordss %xmm3, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: andps %xmm3, %xmm4
+; SSE2-NEXT: js .LBB3_12
+; SSE2-NEXT: # %bb.11:
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: .LBB3_12:
+; SSE2-NEXT: maxss %xmm1, %xmm3
+; SSE2-NEXT: andnps %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: blendvps %xmm0, %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: maxps %xmm3, %xmm2
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: cmpunordps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE41-NEXT: movd %xmm2, %eax
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: js .LBB3_2
+; SSE41-NEXT: # %bb.1:
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: .LBB3_2:
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm3, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: andps %xmm3, %xmm4
+; SSE41-NEXT: js .LBB3_4
+; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: .LBB3_4:
+; SSE41-NEXT: maxss %xmm1, %xmm3
+; SSE41-NEXT: andnps %xmm3, %xmm0
+; SSE41-NEXT: orps %xmm4, %xmm0
+; SSE41-NEXT: movd %xmm0, %eax
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: js .LBB3_6
+; SSE41-NEXT: # %bb.5:
+; SSE41-NEXT: movaps %xmm3, %xmm4
+; SSE41-NEXT: .LBB3_6:
+; SSE41-NEXT: movaps %xmm4, %xmm1
+; SSE41-NEXT: cmpunordss %xmm4, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm5
+; SSE41-NEXT: andps %xmm4, %xmm5
+; SSE41-NEXT: js .LBB3_8
+; SSE41-NEXT: # %bb.7:
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: .LBB3_8:
+; SSE41-NEXT: maxss %xmm3, %xmm4
+; SSE41-NEXT: andnps %xmm4, %xmm1
+; SSE41-NEXT: orps %xmm5, %xmm1
+; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: js .LBB3_10
+; SSE41-NEXT: # %bb.9:
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: .LBB3_10:
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm3, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: andps %xmm3, %xmm4
+; SSE41-NEXT: js .LBB3_12
+; SSE41-NEXT: # %bb.11:
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: .LBB3_12:
+; SSE41-NEXT: maxss %xmm2, %xmm3
+; SSE41-NEXT: andnps %xmm3, %xmm0
+; SSE41-NEXT: orps %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmaxps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: js .LBB3_1
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: vmovaps %xmm0, %xmm2
+; AVX-NEXT: jmp .LBB3_3
+; AVX-NEXT: .LBB3_1:
+; AVX-NEXT: vmovaps %xmm1, %xmm2
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: .LBB3_3:
+; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vmovd %xmm2, %eax
+; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: js .LBB3_4
+; AVX-NEXT: # %bb.5:
+; AVX-NEXT: vmovaps %xmm2, %xmm3
+; AVX-NEXT: jmp .LBB3_6
+; AVX-NEXT: .LBB3_4:
+; AVX-NEXT: vmovapd %xmm1, %xmm3
+; AVX-NEXT: vmovaps %xmm2, %xmm1
+; AVX-NEXT: .LBB3_6:
+; AVX-NEXT: vmaxss %xmm3, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: js .LBB3_7
+; AVX-NEXT: # %bb.8:
+; AVX-NEXT: vmovaps %xmm1, %xmm2
+; AVX-NEXT: jmp .LBB3_9
+; AVX-NEXT: .LBB3_7:
+; AVX-NEXT: vmovaps %xmm0, %xmm2
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: .LBB3_9:
+; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: test_v8f32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmaxps %xmm2, %xmm0, %xmm1
+; AVX512BW-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
+; AVX512BW-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: testl %eax, %eax
+; AVX512BW-NEXT: sets %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vmovaps %xmm0, %xmm2
+; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovd %xmm2, %eax
+; AVX512BW-NEXT: testl %eax, %eax
+; AVX512BW-NEXT: sets %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512BW-NEXT: vmovaps %xmm2, %xmm3
+; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm2
+; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovd %xmm2, %eax
+; AVX512BW-NEXT: testl %eax, %eax
+; AVX512BW-NEXT: sets %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vmovaps %xmm2, %xmm0
+; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v8f32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vblendmps %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxps %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vcmpunordps %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: testl %eax, %eax
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovaps %xmm0, %xmm2
+; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovd %xmm2, %eax
+; AVX512VL-NEXT: testl %eax, %eax
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-NEXT: vmovaps %xmm2, %xmm3
+; AVX512VL-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512VL-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm3, %xmm1, %xmm2
+; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovd %xmm2, %eax
+; AVX512VL-NEXT: testl %eax, %eax
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vmovaps %xmm2, %xmm0
+; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+ %1 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %a0)
+ ret float %1
+}
+
+define float @test_v16f32(<16 x float> %a0) {
+; SSE2-LABEL: test_v16f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pandn %xmm0, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: pandn %xmm2, %xmm7
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: maxps %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm5, %xmm0
+; SSE2-NEXT: cmpunordps %xmm5, %xmm0
+; SSE2-NEXT: andps %xmm0, %xmm5
+; SSE2-NEXT: andnps %xmm6, %xmm0
+; SSE2-NEXT: orps %xmm5, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: pandn %xmm3, %xmm7
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: por %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: maxps %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: cmpunordps %xmm4, %xmm3
+; SSE2-NEXT: andps %xmm3, %xmm4
+; SSE2-NEXT: andnps %xmm1, %xmm3
+; SSE2-NEXT: orps %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: maxps %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: cmpunordps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: js .LBB4_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: .LBB4_2:
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: cmpunordss %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: andps %xmm3, %xmm4
+; SSE2-NEXT: js .LBB4_4
+; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: .LBB4_4:
+; SSE2-NEXT: maxss %xmm2, %xmm3
+; SSE2-NEXT: andnps %xmm3, %xmm1
+; SSE2-NEXT: orps %xmm4, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: js .LBB4_6
+; SSE2-NEXT: # %bb.5:
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: .LBB4_6:
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: cmpunordss %xmm4, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm5
+; SSE2-NEXT: andps %xmm4, %xmm5
+; SSE2-NEXT: js .LBB4_8
+; SSE2-NEXT: # %bb.7:
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: .LBB4_8:
+; SSE2-NEXT: maxss %xmm3, %xmm4
+; SSE2-NEXT: andnps %xmm4, %xmm2
+; SSE2-NEXT: orps %xmm5, %xmm2
+; SSE2-NEXT: movd %xmm2, %eax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: js .LBB4_10
+; SSE2-NEXT: # %bb.9:
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: .LBB4_10:
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: cmpunordss %xmm3, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: andps %xmm3, %xmm4
+; SSE2-NEXT: js .LBB4_12
+; SSE2-NEXT: # %bb.11:
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: .LBB4_12:
+; SSE2-NEXT: maxss %xmm1, %xmm3
+; SSE2-NEXT: andnps %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: movaps %xmm1, %xmm5
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm5
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm6
+; SSE41-NEXT: maxps %xmm5, %xmm6
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordps %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm6
+; SSE41-NEXT: movaps %xmm4, %xmm3
+; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: maxps %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: cmpunordps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm2
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: movaps %xmm6, %xmm1
+; SSE41-NEXT: maxps %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm6, %xmm0
+; SSE41-NEXT: cmpunordps %xmm6, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: js .LBB4_2
+; SSE41-NEXT: # %bb.1:
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: .LBB4_2:
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm3, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: andps %xmm3, %xmm4
+; SSE41-NEXT: js .LBB4_4
+; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: .LBB4_4:
+; SSE41-NEXT: maxss %xmm2, %xmm3
+; SSE41-NEXT: andnps %xmm3, %xmm0
+; SSE41-NEXT: orps %xmm4, %xmm0
+; SSE41-NEXT: movd %xmm0, %eax
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: js .LBB4_6
+; SSE41-NEXT: # %bb.5:
+; SSE41-NEXT: movaps %xmm3, %xmm4
+; SSE41-NEXT: .LBB4_6:
+; SSE41-NEXT: movaps %xmm4, %xmm2
+; SSE41-NEXT: cmpunordss %xmm4, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm5
+; SSE41-NEXT: andps %xmm4, %xmm5
+; SSE41-NEXT: js .LBB4_8
+; SSE41-NEXT: # %bb.7:
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: .LBB4_8:
+; SSE41-NEXT: maxss %xmm3, %xmm4
+; SSE41-NEXT: andnps %xmm4, %xmm2
+; SSE41-NEXT: orps %xmm5, %xmm2
+; SSE41-NEXT: movd %xmm2, %eax
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: js .LBB4_10
+; SSE41-NEXT: # %bb.9:
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: .LBB4_10:
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm3, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm4
+; SSE41-NEXT: andps %xmm3, %xmm4
+; SSE41-NEXT: js .LBB4_12
+; SSE41-NEXT: # %bb.11:
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: .LBB4_12:
+; SSE41-NEXT: maxss %xmm1, %xmm3
+; SSE41-NEXT: andnps %xmm3, %xmm0
+; SSE41-NEXT: orps %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v16f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendvps %ymm0, %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vblendvps %ymm0, %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmaxps %ymm2, %ymm0, %ymm1
+; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm2
+; AVX-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmaxps %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: js .LBB4_1
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: vmovaps %xmm0, %xmm2
+; AVX-NEXT: jmp .LBB4_3
+; AVX-NEXT: .LBB4_1:
+; AVX-NEXT: vmovaps %xmm1, %xmm2
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: .LBB4_3:
+; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vmovd %xmm2, %eax
+; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: js .LBB4_4
+; AVX-NEXT: # %bb.5:
+; AVX-NEXT: vmovaps %xmm2, %xmm3
+; AVX-NEXT: jmp .LBB4_6
+; AVX-NEXT: .LBB4_4:
+; AVX-NEXT: vmovapd %xmm1, %xmm3
+; AVX-NEXT: vmovaps %xmm2, %xmm1
+; AVX-NEXT: .LBB4_6:
+; AVX-NEXT: vmaxss %xmm3, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: js .LBB4_7
+; AVX-NEXT: # %bb.8:
+; AVX-NEXT: vmovaps %xmm1, %xmm2
+; AVX-NEXT: jmp .LBB4_9
+; AVX-NEXT: .LBB4_7:
+; AVX-NEXT: vmovaps %xmm0, %xmm2
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: .LBB4_9:
+; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: test_v16f32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vblendvps %ymm0, %ymm1, %ymm0, %ymm2
+; AVX512BW-NEXT: vblendvps %ymm0, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vmaxps %ymm2, %ymm0, %ymm1
+; AVX512BW-NEXT: vcmpunordps %ymm0, %ymm0, %ymm2
+; AVX512BW-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmaxps %xmm2, %xmm0, %xmm1
+; AVX512BW-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
+; AVX512BW-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: testl %eax, %eax
+; AVX512BW-NEXT: sets %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vmovaps %xmm0, %xmm2
+; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovd %xmm2, %eax
+; AVX512BW-NEXT: testl %eax, %eax
+; AVX512BW-NEXT: sets %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512BW-NEXT: vmovaps %xmm2, %xmm3
+; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm2
+; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovd %xmm2, %eax
+; AVX512BW-NEXT: testl %eax, %eax
+; AVX512BW-NEXT: sets %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vmovaps %xmm2, %xmm0
+; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v16f32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vblendmps %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512VL-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT: vmaxps %ymm2, %ymm1, %ymm0
+; AVX512VL-NEXT: vcmpunordps %ymm1, %ymm1, %k1
+; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vblendmps %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxps %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vcmpunordps %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: testl %eax, %eax
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovaps %xmm0, %xmm2
+; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovd %xmm2, %eax
+; AVX512VL-NEXT: testl %eax, %eax
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-NEXT: vmovaps %xmm2, %xmm3
+; AVX512VL-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
+; AVX512VL-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm3, %xmm1, %xmm2
+; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovd %xmm2, %eax
+; AVX512VL-NEXT: testl %eax, %eax
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vmovaps %xmm2, %xmm0
+; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+ %1 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> %a0)
+ ret float %1
+}
+
+;
+; vXf64
+;
+
+define double @test_v2f64(<2 x double> %a0) {
+; SSE-LABEL: test_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: js .LBB5_2
+; SSE-NEXT: # %bb.1:
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: .LBB5_2:
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: cmpunordsd %xmm3, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm4
+; SSE-NEXT: andpd %xmm3, %xmm4
+; SSE-NEXT: js .LBB5_4
+; SSE-NEXT: # %bb.3:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: .LBB5_4:
+; SSE-NEXT: maxsd %xmm2, %xmm3
+; SSE-NEXT: andnpd %xmm3, %xmm1
+; SSE-NEXT: orpd %xmm4, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: testq %rax, %rax
+; AVX-NEXT: js .LBB5_1
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: vmovapd %xmm0, %xmm2
+; AVX-NEXT: jmp .LBB5_3
+; AVX-NEXT: .LBB5_1:
+; AVX-NEXT: vmovapd %xmm1, %xmm2
+; AVX-NEXT: vmovapd %xmm0, %xmm1
+; AVX-NEXT: .LBB5_3:
+; AVX-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: testq %rax, %rax
+; AVX512-NEXT: sets %al
+; AVX512-NEXT: kmovd %eax, %k1
+; AVX512-NEXT: vmovapd %xmm0, %xmm2
+; AVX512-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: retq
+ %1 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a0)
+ ret double %1
+}
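
The scalar tail in test_v2f64 is the clearest view of the core expansion pattern: x86 max(sd|ss) is not commutative for signed zeros and NaNs (it returns its second source operand on an equal-zero tie and whenever either input is NaN), so the lowering first orders the two operands by the sign bit of the first one (the testq/js pair in the SSE/AVX paths, or the sets/kmovd mask in AVX512) and then re-blends the NaN case with cmpunordsd. A minimal C sketch of that two-operand pattern; the helper name fmaximum_expanded is illustrative, not part of the patch:

  #include <math.h>
  #include <stdint.h>
  #include <string.h>

  static double fmaximum_expanded(double a, double b) {
    uint64_t abits;
    memcpy(&abits, &a, sizeof abits);
    int a_nonneg = !(abits >> 63);           /* the testq/js sign check      */
    double src1 = a_nonneg ? b : a;          /* order operands by sign bit,  */
    double src2 = a_nonneg ? a : b;          /* so +0.0 wins a tie with -0.0 */
    double m = (src1 > src2) ? src1 : src2;  /* maxsd: src2 on ties and NaN  */
    return isnan(src1) ? src1 : m;           /* cmpunordsd + blend: NaN wins */
  }

Because a non-negative first operand always lands on the src2 side, a tie between +0.0 and -0.0 resolves to +0.0 regardless of which input carries the sign; the final isnan check covers the one case maxsd gets wrong, a NaN in src1.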
+
+define double @test_v4f64(<4 x double> %a0) {
+; SSE2-LABEL: test_v4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: maxpd %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: cmpunordpd %xmm3, %xmm1
+; SSE2-NEXT: andpd %xmm1, %xmm3
+; SSE2-NEXT: andnpd %xmm0, %xmm1
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: js .LBB6_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: .LBB6_2:
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: cmpunordsd %xmm3, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: andpd %xmm3, %xmm4
+; SSE2-NEXT: js .LBB6_4
+; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: .LBB6_4:
+; SSE2-NEXT: maxsd %xmm2, %xmm3
+; SSE2-NEXT: andnpd %xmm3, %xmm0
+; SSE2-NEXT: orpd %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movapd %xmm0, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm0, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: maxpd %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: movq %xmm2, %rax
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: movapd %xmm2, %xmm3
+; SSE41-NEXT: js .LBB6_2
+; SSE41-NEXT: # %bb.1:
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: .LBB6_2:
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm0, %xmm4
+; SSE41-NEXT: andpd %xmm3, %xmm4
+; SSE41-NEXT: js .LBB6_4
+; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: .LBB6_4:
+; SSE41-NEXT: maxsd %xmm1, %xmm3
+; SSE41-NEXT: andnpd %xmm3, %xmm0
+; SSE41-NEXT: orpd %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v4f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX-NEXT: vmovq %xmm1, %rax
+; AVX-NEXT: testq %rax, %rax
+; AVX-NEXT: js .LBB6_1
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: vmovapd %xmm1, %xmm2
+; AVX-NEXT: jmp .LBB6_3
+; AVX-NEXT: .LBB6_1:
+; AVX-NEXT: vmovapd %xmm0, %xmm2
+; AVX-NEXT: vmovapd %xmm1, %xmm0
+; AVX-NEXT: .LBB6_3:
+; AVX-NEXT: vmaxsd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: test_v4f64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
+; AVX512BW-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
+; AVX512BW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: testq %rax, %rax
+; AVX512BW-NEXT: sets %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vmovapd %xmm0, %xmm2
+; AVX512BW-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v4f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vblendmpd %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxpd %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vcmpunordpd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: testq %rax, %rax
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovapd %xmm0, %xmm2
+; AVX512VL-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+ %1 = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %a0)
+ ret double %1
+}
+
+define double @test_v8f64(<8 x double> %a0) {
+; SSE2-LABEL: test_v8f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[3,3]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: movdqa %xmm6, %xmm7
+; SSE2-NEXT: pandn %xmm2, %xmm7
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: por %xmm7, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm0
+; SSE2-NEXT: maxpd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm6, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm6, %xmm2
+; SSE2-NEXT: andpd %xmm2, %xmm6
+; SSE2-NEXT: andnpd %xmm0, %xmm2
+; SSE2-NEXT: orpd %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE2-NEXT: xorpd %xmm0, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm5
+; SSE2-NEXT: movaps %xmm1, %xmm6
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[3,3]
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: pandn %xmm3, %xmm7
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm4
+; SSE2-NEXT: por %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: maxpd %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: cmpunordpd %xmm4, %xmm3
+; SSE2-NEXT: andpd %xmm3, %xmm4
+; SSE2-NEXT: andnpd %xmm1, %xmm3
+; SSE2-NEXT: orpd %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: maxpd %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm1
+; SSE2-NEXT: andpd %xmm1, %xmm0
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: js .LBB7_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: .LBB7_2:
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: cmpunordsd %xmm3, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: andpd %xmm3, %xmm4
+; SSE2-NEXT: js .LBB7_4
+; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: .LBB7_4:
+; SSE2-NEXT: maxsd %xmm2, %xmm3
+; SSE2-NEXT: andnpd %xmm3, %xmm0
+; SSE2-NEXT: orpd %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movapd %xmm0, %xmm4
+; SSE41-NEXT: movapd %xmm1, %xmm5
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm6
+; SSE41-NEXT: maxpd %xmm5, %xmm6
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
+; SSE41-NEXT: movapd %xmm4, %xmm3
+; SSE41-NEXT: movapd %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: maxpd %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm1
+; SSE41-NEXT: maxpd %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm6, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: movq %xmm1, %rax
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: js .LBB7_2
+; SSE41-NEXT: # %bb.1:
+; SSE41-NEXT: movapd %xmm2, %xmm3
+; SSE41-NEXT: .LBB7_2:
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm0, %xmm4
+; SSE41-NEXT: andpd %xmm3, %xmm4
+; SSE41-NEXT: js .LBB7_4
+; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: .LBB7_4:
+; SSE41-NEXT: maxsd %xmm2, %xmm3
+; SSE41-NEXT: andnpd %xmm3, %xmm0
+; SSE41-NEXT: orpd %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v8f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vblendvpd %ymm0, %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm1
+; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2
+; AVX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX-NEXT: vmovq %xmm1, %rax
+; AVX-NEXT: testq %rax, %rax
+; AVX-NEXT: js .LBB7_1
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: vmovapd %xmm1, %xmm2
+; AVX-NEXT: jmp .LBB7_3
+; AVX-NEXT: .LBB7_1:
+; AVX-NEXT: vmovapd %xmm0, %xmm2
+; AVX-NEXT: vmovapd %xmm1, %xmm0
+; AVX-NEXT: .LBB7_3:
+; AVX-NEXT: vmaxsd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: test_v8f64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm2
+; AVX512BW-NEXT: vblendvpd %ymm0, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vmaxpd %ymm2, %ymm0, %ymm1
+; AVX512BW-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2
+; AVX512BW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
+; AVX512BW-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
+; AVX512BW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: testq %rax, %rax
+; AVX512BW-NEXT: sets %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vmovapd %xmm0, %xmm2
+; AVX512BW-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v8f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm1, %k1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vblendmpd %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512VL-NEXT: vmovapd %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT: vmaxpd %ymm2, %ymm1, %ymm0
+; AVX512VL-NEXT: vcmpunordpd %ymm1, %ymm1, %k1
+; AVX512VL-NEXT: vmovapd %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vblendmpd %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxpd %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vcmpunordpd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: testq %rax, %rax
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovapd %xmm0, %xmm2
+; AVX512VL-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+ %1 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> %a0)
+ ret double %1
+}
+
+define double @test_v16f64(<16 x double> %a0) {
+; SSE2-LABEL: test_v16f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm9
+; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[3,3]
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm9
+; SSE2-NEXT: pandn %xmm0, %xmm9
+; SSE2-NEXT: movdqa %xmm10, %xmm11
+; SSE2-NEXT: pandn %xmm4, %xmm11
+; SSE2-NEXT: pand %xmm10, %xmm4
+; SSE2-NEXT: por %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm10
+; SSE2-NEXT: por %xmm11, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm0
+; SSE2-NEXT: maxpd %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm10, %xmm4
+; SSE2-NEXT: cmpunordpd %xmm10, %xmm4
+; SSE2-NEXT: andpd %xmm4, %xmm10
+; SSE2-NEXT: andnpd %xmm0, %xmm4
+; SSE2-NEXT: orpd %xmm10, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
+; SSE2-NEXT: xorpd %xmm0, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm9
+; SSE2-NEXT: movaps %xmm2, %xmm10
+; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[3,3]
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm10
+; SSE2-NEXT: pandn %xmm2, %xmm10
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pandn %xmm6, %xmm12
+; SSE2-NEXT: pand %xmm11, %xmm6
+; SSE2-NEXT: por %xmm10, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm11
+; SSE2-NEXT: por %xmm12, %xmm11
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: maxpd %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm11, %xmm6
+; SSE2-NEXT: cmpunordpd %xmm11, %xmm6
+; SSE2-NEXT: andpd %xmm6, %xmm11
+; SSE2-NEXT: andnpd %xmm2, %xmm6
+; SSE2-NEXT: orpd %xmm11, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: por %xmm9, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: maxpd %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm2
+; SSE2-NEXT: andpd %xmm2, %xmm0
+; SSE2-NEXT: andnpd %xmm4, %xmm2
+; SSE2-NEXT: orpd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE2-NEXT: xorpd %xmm0, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: movaps %xmm1, %xmm6
+; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[3,3]
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm6
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pandn %xmm5, %xmm10
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: pand %xmm1, %xmm9
+; SSE2-NEXT: por %xmm10, %xmm9
+; SSE2-NEXT: movdqa %xmm9, %xmm1
+; SSE2-NEXT: maxpd %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm9, %xmm5
+; SSE2-NEXT: cmpunordpd %xmm9, %xmm5
+; SSE2-NEXT: andpd %xmm5, %xmm9
+; SSE2-NEXT: andnpd %xmm1, %xmm5
+; SSE2-NEXT: orpd %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: movaps %xmm3, %xmm9
+; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[3,3]
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: pandn %xmm3, %xmm9
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: pandn %xmm7, %xmm10
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: por %xmm9, %xmm7
+; SSE2-NEXT: pand %xmm3, %xmm8
+; SSE2-NEXT: por %xmm10, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: maxpd %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm8, %xmm7
+; SSE2-NEXT: cmpunordpd %xmm8, %xmm7
+; SSE2-NEXT: andpd %xmm7, %xmm8
+; SSE2-NEXT: andnpd %xmm3, %xmm7
+; SSE2-NEXT: orpd %xmm8, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: maxpd %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: cmpunordpd %xmm1, %xmm5
+; SSE2-NEXT: andpd %xmm5, %xmm1
+; SSE2-NEXT: andnpd %xmm3, %xmm5
+; SSE2-NEXT: orpd %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: maxpd %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm1
+; SSE2-NEXT: andpd %xmm1, %xmm0
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: js .LBB8_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: .LBB8_2:
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: cmpunordsd %xmm3, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: andpd %xmm3, %xmm4
+; SSE2-NEXT: js .LBB8_4
+; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: .LBB8_4:
+; SSE2-NEXT: maxsd %xmm2, %xmm3
+; SSE2-NEXT: andnpd %xmm3, %xmm0
+; SSE2-NEXT: orpd %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movapd %xmm1, %xmm8
+; SSE41-NEXT: movapd %xmm0, %xmm1
+; SSE41-NEXT: movapd %xmm3, %xmm9
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
+; SSE41-NEXT: movapd %xmm7, %xmm10
+; SSE41-NEXT: maxpd %xmm9, %xmm10
+; SSE41-NEXT: movapd %xmm7, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm7, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10
+; SSE41-NEXT: movapd %xmm8, %xmm7
+; SSE41-NEXT: movapd %xmm8, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
+; SSE41-NEXT: movapd %xmm5, %xmm3
+; SSE41-NEXT: maxpd %xmm7, %xmm3
+; SSE41-NEXT: movapd %xmm5, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm5, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm5
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10
+; SSE41-NEXT: movapd %xmm10, %xmm3
+; SSE41-NEXT: maxpd %xmm5, %xmm3
+; SSE41-NEXT: movapd %xmm10, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm10, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
+; SSE41-NEXT: movapd %xmm2, %xmm5
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
+; SSE41-NEXT: movapd %xmm6, %xmm2
+; SSE41-NEXT: maxpd %xmm5, %xmm2
+; SSE41-NEXT: movapd %xmm6, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2
+; SSE41-NEXT: movapd %xmm1, %xmm5
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
+; SSE41-NEXT: movapd %xmm4, %xmm1
+; SSE41-NEXT: maxpd %xmm5, %xmm1
+; SSE41-NEXT: movapd %xmm4, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm4
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: maxpd %xmm4, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm1
+; SSE41-NEXT: maxpd %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: movq %xmm1, %rax
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: js .LBB8_2
+; SSE41-NEXT: # %bb.1:
+; SSE41-NEXT: movapd %xmm2, %xmm3
+; SSE41-NEXT: .LBB8_2:
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm0, %xmm4
+; SSE41-NEXT: andpd %xmm3, %xmm4
+; SSE41-NEXT: js .LBB8_4
+; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: .LBB8_4:
+; SSE41-NEXT: maxsd %xmm2, %xmm3
+; SSE41-NEXT: andnpd %xmm3, %xmm0
+; SSE41-NEXT: orpd %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v16f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm4
+; AVX-NEXT: vblendvpd %ymm1, %ymm1, %ymm3, %ymm1
+; AVX-NEXT: vmaxpd %ymm4, %ymm1, %ymm3
+; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm4
+; AVX-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
+; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm3
+; AVX-NEXT: vblendvpd %ymm0, %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vmaxpd %ymm3, %ymm0, %ymm2
+; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm3
+; AVX-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vblendvpd %ymm0, %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm1
+; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2
+; AVX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX-NEXT: vmovq %xmm1, %rax
+; AVX-NEXT: testq %rax, %rax
+; AVX-NEXT: js .LBB8_1
+; AVX-NEXT: # %bb.2:
+; AVX-NEXT: vmovapd %xmm1, %xmm2
+; AVX-NEXT: jmp .LBB8_3
+; AVX-NEXT: .LBB8_1:
+; AVX-NEXT: vmovapd %xmm0, %xmm2
+; AVX-NEXT: vmovapd %xmm1, %xmm0
+; AVX-NEXT: .LBB8_3:
+; AVX-NEXT: vmaxsd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512BW-LABEL: test_v16f64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
+; AVX512BW-NEXT: vblendmpd %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vmaxpd %zmm2, %zmm1, %zmm0
+; AVX512BW-NEXT: vcmpunordpd %zmm1, %zmm1, %k1
+; AVX512BW-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm2
+; AVX512BW-NEXT: vblendvpd %ymm0, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vmaxpd %ymm2, %ymm0, %ymm1
+; AVX512BW-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2
+; AVX512BW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
+; AVX512BW-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
+; AVX512BW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: testq %rax, %rax
+; AVX512BW-NEXT: sets %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vmovapd %xmm0, %xmm2
+; AVX512BW-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v16f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
+; AVX512VL-NEXT: vblendmpd %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512VL-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmaxpd %zmm2, %zmm1, %zmm0
+; AVX512VL-NEXT: vcmpunordpd %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm1, %k1
+; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vblendmpd %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512VL-NEXT: vmovapd %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT: vmaxpd %ymm2, %ymm1, %ymm0
+; AVX512VL-NEXT: vcmpunordpd %ymm1, %ymm1, %k1
+; AVX512VL-NEXT: vmovapd %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vblendmpd %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxpd %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vcmpunordpd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: testq %rax, %rax
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovapd %xmm0, %xmm2
+; AVX512VL-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+ %1 = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> %a0)
+ ret double %1
+}
+
+declare float @llvm.vector.reduce.fmaximum.v1f32(<1 x float>)
+declare float @llvm.vector.reduce.fmaximum.v2f32(<2 x float>)
+declare float @llvm.vector.reduce.fmaximum.v3f32(<3 x float>)
+declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmaximum.v8f32(<8 x float>)
+declare float @llvm.vector.reduce.fmaximum.v16f32(<16 x float>)
+
+declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>)
+declare double @llvm.vector.reduce.fmaximum.v4f64(<4 x double>)
+declare double @llvm.vector.reduce.fmaximum.v8f64(<8 x double>)
+declare double @llvm.vector.reduce.fmaximum.v16f64(<16 x double>)
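
For reference, every expansion above implements the same result: any NaN lane makes the reduction NaN, and +0.0 outranks -0.0. A hedged scalar sketch of that reduction; the names are illustrative and n is assumed to be a power of two, as in these tests:

  #include <math.h>

  /* Any NaN input yields NaN; a tie between signed zeros resolves to +0.0. */
  static double fmaximum_ref(double a, double b) {
    if (isnan(a) || isnan(b))
      return NAN;
    if (a == 0.0 && b == 0.0)
      return signbit(a) ? b : a;
    return a > b ? a : b;
  }

  /* Halving reduction, mirroring the upper-half/lower-half vector folds
     in the AVX paths above before they fall into the scalar tail. */
  static double reduce_fmaximum(double v[], int n) {
    for (; n > 1; n /= 2)
      for (int i = 0; i < n / 2; ++i)
        v[i] = fmaximum_ref(v[i], v[i + n / 2]);
    return v[0];
  }

Since fmaximum is commutative and associative under these rules, the fold order (pairwise halving here, sequential lane extraction in the scalar tails above) does not change the result.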