[llvm] [X86] Lower `minimum`/`maximum`/`minimumnum`/`maximumnum` using bitwise operations (PR #170069)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 1 02:27:23 PST 2025
https://github.com/valadaptive updated https://github.com/llvm/llvm-project/pull/170069
From 5dfeb5ce600fee260a47a1bf1d2fa5e36005bbf4 Mon Sep 17 00:00:00 2001
From: valadaptive <valadaptive at protonmail.com>
Date: Sun, 30 Nov 2025 21:22:57 -0500
Subject: [PATCH 1/2] [X86][ISelLowering] Lower `minimum[num]`/`maximum[num]`
using bitwise ops
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 121 +-
.../X86/avx512fp16-fminimum-fmaximum.ll | 42 +-
llvm/test/CodeGen/X86/extractelement-fp.ll | 132 +-
llvm/test/CodeGen/X86/fminimum-fmaximum.ll | 1351 ++++------
.../CodeGen/X86/fminimumnum-fmaximumnum.ll | 1675 +++++-------
.../CodeGen/X86/vector-reduce-fmaximum.ll | 2307 +++++++----------
6 files changed, 2129 insertions(+), 3499 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1b0bf6823e390..26818a26ad658 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29480,7 +29480,6 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
uint64_t SizeInBits = VT.getScalarSizeInBits();
APInt PreferredZero = APInt::getZero(SizeInBits);
APInt OppositeZero = PreferredZero;
- EVT IVT = VT.changeTypeToInteger();
X86ISD::NodeType MinMaxOp;
if (IsMaxOp) {
MinMaxOp = X86ISD::FMAX;
@@ -29492,8 +29491,8 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
EVT SetCCType =
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
- // The tables below show the expected result of Max in cases of NaN and
- // signed zeros.
+ // The tables below show the expected result of Max in cases of NaN and signed
+ // zeros.
//
// Y Y
// Num xNaN +0 -0
@@ -29503,12 +29502,9 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
// xNaN | X | X/Y | -0 | +0 | -0 |
// --------------- ---------------
//
- // It is achieved by means of FMAX/FMIN with preliminary checks and operand
- // reordering.
- //
- // We check if any of operands is NaN and return NaN. Then we check if any of
- // operands is zero or negative zero (for fmaximum and fminimum respectively)
- // to ensure the correct zero is returned.
+ // It is achieved by means of FMAX/FMIN with preliminary checks: operands are
+ // reordered if one of them is a constant, and otherwise bitwise operations
+ // and selects handle signed-zero and NaN operands.
auto MatchesZero = [](SDValue Op, APInt Zero) {
Op = peekThroughBitcasts(Op);
if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
@@ -29539,15 +29535,17 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
Op->getFlags().hasNoSignedZeros() ||
DAG.isKnownNeverZeroFloat(X) ||
DAG.isKnownNeverZeroFloat(Y);
- SDValue NewX, NewY;
+ bool ShouldHandleZeros = true;
+ SDValue NewX = X;
+ SDValue NewY = Y;
if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
MatchesZero(X, OppositeZero)) {
// Operands are already in right order or order does not matter.
- NewX = X;
- NewY = Y;
+ ShouldHandleZeros = false;
} else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
NewX = Y;
NewY = X;
+ ShouldHandleZeros = false;
} else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
(Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
if (IsXNeverNaN)
@@ -29569,33 +29567,6 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
- } else {
- SDValue IsXSigned;
- if (Subtarget.is64Bit() || VT != MVT::f64) {
- SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
- SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
- IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
- } else {
- assert(VT == MVT::f64);
- SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
- DAG.getConstantFP(0, DL, MVT::v2f64), X,
- DAG.getVectorIdxConstant(0, DL));
- SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
- DAG.getVectorIdxConstant(1, DL));
- Hi = DAG.getBitcast(MVT::i32, Hi);
- SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
- EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), MVT::i32);
- IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
- }
- if (MinMaxOp == X86ISD::FMAX) {
- NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
- NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
- } else {
- NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
- NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
- }
}
bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
@@ -29612,10 +29583,80 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
+ // We handle signed-zero ordering by combining the first operand's sign bit
+ // with the result: AND for maximum (so a +0.0 first operand wins) and OR for
+ // minimum (so a -0.0 first operand wins).
+ if (ShouldHandleZeros) {
+ const fltSemantics &Sem = VT.getFltSemantics();
+ unsigned EltBits = VT.getScalarSizeInBits();
+ bool IsFakeVector = !VT.isVector();
+ MVT LogicVT = VT.getSimpleVT();
+ if (IsFakeVector)
+ LogicVT = (VT == MVT::f64) ? MVT::v2f64
+ : (VT == MVT::f32) ? MVT::v4f32
+ : MVT::v8f16;
+
+ // We take the sign bit from the first operand and combine it with the
+ // output sign bit (see below). Right now, if ShouldHandleZeros is true, the
+ // operands will never have been swapped. If you add another optimization
+ // that swaps the input operands when one is a known value, make sure this
+ // logic stays correct!
+ SDValue LogicX = NewX;
+ SDValue LogicMinMax = MinMax;
+ if (IsFakeVector) {
+ // Promote scalars to vectors for bitwise operations.
+ LogicX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, LogicVT, NewX);
+ LogicMinMax = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, LogicVT, MinMax);
+ }
+
+ // x86's min/max operations return the second operand if both inputs are
+ // signed zero. For the maximum operation, we want to "and" the sign bit of
+ // the output with the sign bit of the first operand--that means that if the
+ // first operand is +0.0, the output will be too. For the minimum, it's the
+ // opposite: we "or" the output sign bit with the sign bit of the first
+ // operand, ensuring that if the first operand is -0.0, the output will be
+ // too.
+ SDValue Result;
+ if (IsMaxOp) {
+ // getSignedMaxValue returns a bit pattern of all ones but the highest
+ // bit. We "or" that with the first operand, then "and" that with the max
+ // operation's result. That clears only the sign bit, and only if the
+ // first operand is positive.
+ SDValue OrMask = DAG.getConstantFP(
+ APFloat(Sem, APInt::getSignedMaxValue(EltBits)), DL, LogicVT);
+ SDValue MaskedSignBit =
+ DAG.getNode(X86ISD::FOR, DL, LogicVT, LogicX, OrMask);
+ Result =
+ DAG.getNode(X86ISD::FAND, DL, LogicVT, MaskedSignBit, LogicMinMax);
+ } else {
+ // Likewise, getSignMask returns a bit pattern with only the highest bit
+ // set. This one *sets* only the sign bit, and only if the first operand
+ // is *negative*.
+ SDValue AndMask = DAG.getConstantFP(
+ APFloat(Sem, APInt::getSignMask(EltBits)), DL, LogicVT);
+ SDValue MaskedSignBit =
+ DAG.getNode(X86ISD::FAND, DL, LogicVT, LogicX, AndMask);
+ Result =
+ DAG.getNode(X86ISD::FOR, DL, LogicVT, MaskedSignBit, LogicMinMax);
+ }
+
+ // Extract scalar back from vector.
+ if (IsFakeVector)
+ MinMax = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Result,
+ DAG.getVectorIdxConstant(0, DL));
+ else
+ MinMax = Result;
+ }
+
if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
return MinMax;
- SDValue NaNSrc = IsNum ? MinMax : NewX;
+ // The x86 min/max instructions return the second operand if either input is
+ // NaN, which matches neither the numeric nor the non-numeric semantics. For
+ // the non-numeric versions, we want to return NaN if either operand is NaN,
+ // so we check whether NewX (the first operand) is NaN and select it if so;
+ // if NewY is NaN, the min/max has already returned it. For the numeric
+ // versions, we want to return the non-NaN operand if there is one, so we
+ // check whether NewY (the second operand) is NaN and again select the first
+ // operand if so.
+ SDValue NaNSrc = IsNum ? NewY : NewX;
SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NaNSrc, NaNSrc, ISD::SETUO);
return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
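
For reference, the scalar form of the new sequence can be written as a few bit
operations. The standalone C++20 sketch below is illustrative only and not part
of the patch (x86_max, maximum_ref, and minimum_ref are made-up names): the x86
max/min return the second operand when the inputs compare equal (e.g. both are
zeros) or when either is NaN, the first operand's sign bit is folded in with
AND/OR to order the zeros, and a final unordered compare plus select restores
the NaN-propagating behaviour. Per the comment in the patch, the numeric
variants differ only in testing the second operand for NaN instead.

#include <bit>
#include <cmath>
#include <cstdint>

// Models x86 maxss/minss: the second operand is returned when the inputs
// compare equal (e.g. +0.0 vs -0.0) or when either input is NaN.
static float x86_max(float x, float y) { return x > y ? x : y; }
static float x86_min(float x, float y) { return x < y ? x : y; }

// llvm.maximum.f32 semantics: NaN-propagating, and +0.0 is larger than -0.0.
float maximum_ref(float x, float y) {
  uint32_t xi = std::bit_cast<uint32_t>(x);
  uint32_t ri = std::bit_cast<uint32_t>(x86_max(x, y));
  // OR x with the signed-max mask, then AND with the max result: only the
  // sign bit can be cleared, and only when x is positive (FOR + FAND above).
  float fixed = std::bit_cast<float>((xi | 0x7fffffffu) & ri);
  return std::isnan(x) ? x : fixed; // cmpunordss + blendvps in the tests
}

// llvm.minimum.f32 semantics: NaN-propagating, and -0.0 is smaller than +0.0.
float minimum_ref(float x, float y) {
  uint32_t xi = std::bit_cast<uint32_t>(x);
  uint32_t ri = std::bit_cast<uint32_t>(x86_min(x, y));
  // AND x with the sign mask, then OR with the min result: only the sign bit
  // can be set, and only when x is negative.
  float fixed = std::bit_cast<float>((xi & 0x80000000u) | ri);
  return std::isnan(x) ? x : fixed;
}

// E.g. maximum_ref(+0.0f, -0.0f) yields +0.0 and minimum_ref(+0.0f, -0.0f)
// yields -0.0, matching the tables in the comment above.
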
diff --git a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
index 53ac283170a5f..59cf38f82b7c0 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll
@@ -13,14 +13,9 @@ declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>)
define half @test_fminimum(half %x, half %y) {
; CHECK-LABEL: test_fminimum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovw %xmm0, %eax
-; CHECK-NEXT: testw %ax, %ax
-; CHECK-NEXT: sets %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovaps %xmm1, %xmm2
-; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1}
-; CHECK-NEXT: vminsh %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpternlogq {{.*#+}} xmm1 = (xmm1 & xmm0) | xmm2
; CHECK-NEXT: vcmpunordsh %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
@@ -92,16 +87,12 @@ define half @test_fminimum_combine_cmps(half %x, half %y) {
define half @test_fmaximum(half %x, half %y) {
; CHECK-LABEL: test_fmaximum:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovw %xmm0, %eax
-; CHECK-NEXT: testw %ax, %ax
-; CHECK-NEXT: sets %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovaps %xmm0, %xmm2
-; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT: vpternlogq {{.*#+}} xmm1 = xmm2 & (xmm1 | xmm0)
+; CHECK-NEXT: vcmpunordsh %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmaxsh %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vcmpunordsh %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%r = call half @llvm.maximum.f16(half %x, half %y)
ret half %r
@@ -196,10 +187,9 @@ define <16 x half> @test_fmaximum_v16f16_nans(<16 x half> %x, <16 x half> %y) "n
define <32 x half> @test_fminimum_v32f16_szero(<32 x half> %x, <32 x half> %y) "no-nans-fp-math"="true" {
; CHECK-LABEL: test_fminimum_v32f16_szero:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovw2m %zmm0, %k1
-; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm2 {%k1}
-; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vminph %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vminph %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & zmm2) | zmm1
; CHECK-NEXT: retq
%r = call <32 x half> @llvm.minimum.v32f16(<32 x half> %x, <32 x half> %y)
ret <32 x half> %r
@@ -208,12 +198,12 @@ define <32 x half> @test_fminimum_v32f16_szero(<32 x half> %x, <32 x half> %y) "
define <32 x half> @test_fmaximum_v32f16_nans_szero(<32 x half> %x, <32 x half> %y) {
; CHECK-LABEL: test_fmaximum_v32f16_nans_szero:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovw2m %zmm0, %k1
-; CHECK-NEXT: vpblendmw %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmaxph %zmm1, %zmm0, %zmm2
+; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; CHECK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 & (zmm1 | zmm0)
+; CHECK-NEXT: vcmpunordph %zmm0, %zmm0, %k1
; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmaxph %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vcmpunordph %zmm1, %zmm1, %k1
-; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%r = call <32 x half> @llvm.maximum.v32f16(<32 x half> %x, <32 x half> %y)
ret <32 x half> %r
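
On AVX-512 targets the FOR/FAND pair and the broadcast constant fold into a
single vpternlogq, whose disassembly comment spells out the boolean function
(for example zmm0 = (zmm0 & zmm2) | zmm1 above). As a rough sketch, such a
ternary-logic immediate can be derived by enumerating the truth table; the
operand-to-bit-position mapping used here is an assumption for illustration,
not something taken from the patch.

#include <cstdint>

// Evaluate f on every combination of the three input bits and pack the
// results into the 8-bit ternary-logic immediate.
template <typename F> uint8_t ternlogImm(F f) {
  uint8_t imm = 0;
  for (int i = 0; i < 8; ++i)
    if (f((i >> 2) & 1, (i >> 1) & 1, i & 1))
      imm |= uint8_t(1u << i);
  return imm;
}

// The "(a & c) | b" pattern used for the minimum lowering above:
// uint8_t imm = ternlogImm([](int a, int b, int c) { return (a & c) | b; });
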
diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll
index 1706f17eac165..9ce1fd6f20976 100644
--- a/llvm/test/CodeGen/X86/extractelement-fp.ll
+++ b/llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -675,37 +675,23 @@ define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
define float @fmaximum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fmaximum_v4f32:
; X64: # %bb.0:
-; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: js .LBB30_1
-; X64-NEXT: # %bb.2:
-; X64-NEXT: vmovdqa %xmm0, %xmm2
-; X64-NEXT: jmp .LBB30_3
-; X64-NEXT: .LBB30_1:
-; X64-NEXT: vmovdqa %xmm1, %xmm2
-; X64-NEXT: vmovdqa %xmm0, %xmm1
-; X64-NEXT: .LBB30_3:
-; X64-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; X64-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X64-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; X64-NEXT: vorps %xmm2, %xmm0, %xmm2
+; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; X64-NEXT: vandps %xmm1, %xmm2, %xmm1
+; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; X64-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fmaximum_v4f32:
; X86: # %bb.0:
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB30_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: jmp .LBB30_3
-; X86-NEXT: .LBB30_1:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: .LBB30_3:
; X86-NEXT: pushl %eax
-; X86-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; X86-NEXT: vorps %xmm2, %xmm0, %xmm2
+; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT: vandps %xmm1, %xmm2, %xmm1
+; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -718,41 +704,25 @@ define float @fmaximum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
define double @fmaximum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fmaximum_v4f64:
; X64: # %bb.0:
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: testq %rax, %rax
-; X64-NEXT: js .LBB31_1
-; X64-NEXT: # %bb.2:
-; X64-NEXT: vmovdqa %xmm0, %xmm2
-; X64-NEXT: jmp .LBB31_3
-; X64-NEXT: .LBB31_1:
-; X64-NEXT: vmovdqa %xmm1, %xmm2
-; X64-NEXT: vmovdqa %xmm0, %xmm1
-; X64-NEXT: .LBB31_3:
-; X64-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
-; X64-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; X64-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; X64-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; X64-NEXT: vandpd %xmm1, %xmm2, %xmm1
+; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
+; X64-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fmaximum_v4f64:
; X86: # %bb.0:
-; X86-NEXT: vextractps $1, %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB31_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovapd %xmm0, %xmm2
-; X86-NEXT: jmp .LBB31_3
-; X86-NEXT: .LBB31_1:
-; X86-NEXT: vmovapd %xmm1, %xmm2
-; X86-NEXT: vmovapd %xmm0, %xmm1
-; X86-NEXT: .LBB31_3:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
-; X86-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
-; X86-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; X86-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT: vandpd %xmm1, %xmm2, %xmm1
+; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
+; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -767,35 +737,21 @@ define double @fmaximum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
define float @fminimum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fminimum_v4f32:
; X64: # %bb.0:
-; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: js .LBB32_1
-; X64-NEXT: # %bb.2:
-; X64-NEXT: vmovdqa %xmm1, %xmm2
-; X64-NEXT: jmp .LBB32_3
-; X64-NEXT: .LBB32_1:
-; X64-NEXT: vmovdqa %xmm0, %xmm2
-; X64-NEXT: vmovdqa %xmm1, %xmm0
-; X64-NEXT: .LBB32_3:
-; X64-NEXT: vminss %xmm2, %xmm0, %xmm1
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X64-NEXT: vandps %xmm2, %xmm0, %xmm2
+; X64-NEXT: vminss %xmm1, %xmm0, %xmm1
+; X64-NEXT: vorps %xmm1, %xmm2, %xmm1
; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X64-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fminimum_v4f32:
; X86: # %bb.0:
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB32_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB32_3
-; X86-NEXT: .LBB32_1:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB32_3:
; X86-NEXT: pushl %eax
-; X86-NEXT: vminss %xmm2, %xmm0, %xmm1
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT: vandps %xmm2, %xmm0, %xmm2
+; X86-NEXT: vminss %xmm1, %xmm0, %xmm1
+; X86-NEXT: vorps %xmm1, %xmm2, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -810,17 +766,9 @@ define float @fminimum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
define double @fminimum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fminimum_v4f64:
; X64: # %bb.0:
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: testq %rax, %rax
-; X64-NEXT: js .LBB33_1
-; X64-NEXT: # %bb.2:
-; X64-NEXT: vmovdqa %xmm1, %xmm2
-; X64-NEXT: jmp .LBB33_3
-; X64-NEXT: .LBB33_1:
-; X64-NEXT: vmovdqa %xmm0, %xmm2
-; X64-NEXT: vmovdqa %xmm1, %xmm0
-; X64-NEXT: .LBB33_3:
-; X64-NEXT: vminsd %xmm2, %xmm0, %xmm1
+; X64-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; X64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; X64-NEXT: vorpd %xmm1, %xmm2, %xmm1
; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
; X64-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
@@ -828,21 +776,13 @@ define double @fminimum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
;
; X86-LABEL: fminimum_v4f64:
; X86: # %bb.0:
-; X86-NEXT: vextractps $1, %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB33_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovapd %xmm1, %xmm2
-; X86-NEXT: jmp .LBB33_3
-; X86-NEXT: .LBB33_1:
-; X86-NEXT: vmovapd %xmm0, %xmm2
-; X86-NEXT: vmovapd %xmm1, %xmm0
-; X86-NEXT: .LBB33_3:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
-; X86-NEXT: vminsd %xmm2, %xmm0, %xmm1
+; X86-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT: vorpd %xmm1, %xmm2, %xmm1
; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
index 06515e4f82687..910acd6d82ae2 100644
--- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -22,56 +22,37 @@ declare <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat>, <4 x bfloat>)
define float @test_fmaximum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximum:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: js .LBB0_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: .LBB0_2:
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: cmpunordss %xmm3, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm3
; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB0_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: .LBB0_4:
-; SSE2-NEXT: maxss %xmm1, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm0
-; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: maxss %xmm1, %xmm4
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: andps %xmm4, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximum:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB0_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: jmp .LBB0_3
-; AVX1-NEXT: .LBB0_1:
-; AVX1-NEXT: vmovdqa %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: .LBB0_3:
-; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vandps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximum:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vorps %xmm2, %xmm0, %xmm2
+; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vandps %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximum:
@@ -82,19 +63,10 @@ define float @test_fmaximum(float %x, float %y) nounwind {
; X86-LABEL: test_fmaximum:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm2, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB0_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm2, %xmm1
-; X86-NEXT: jmp .LBB0_3
-; X86-NEXT: .LBB0_1:
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: vmovdqa %xmm2, %xmm0
-; X86-NEXT: .LBB0_3:
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm1
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT: vandps %xmm1, %xmm2, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -183,45 +155,30 @@ define float @test_fmaximum_nnan(float %x, float %y) nounwind {
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: addss %xmm1, %xmm2
; SSE2-NEXT: subss %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: js .LBB4_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: maxss %xmm2, %xmm0
-; SSE2-NEXT: retq
-; SSE2-NEXT: .LBB4_1:
-; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: andps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: maxss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximum_nnan:
; AVX1: # %bb.0:
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm2, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB4_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmaxss %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB4_1:
-; AVX1-NEXT: vmovaps %xmm0, %xmm1
-; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
+; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fmaximum_nnan:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: testl %eax, %eax
-; AVX512F-NEXT: sets %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovaps %xmm2, %xmm1
-; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512F-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
-; AVX512F-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
+; AVX512F-NEXT: vorps %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vandps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fmaximum_nnan:
@@ -247,20 +204,12 @@ define float @test_fmaximum_nnan(float %x, float %y) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-NEXT: vaddss %xmm0, %xmm2, %xmm1
-; X86-NEXT: vsubss %xmm0, %xmm2, %xmm0
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB4_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovaps %xmm1, %xmm2
-; X86-NEXT: jmp .LBB4_3
-; X86-NEXT: .LBB4_1:
-; X86-NEXT: vmovaps %xmm0, %xmm2
-; X86-NEXT: vmovaps %xmm1, %xmm0
-; X86-NEXT: .LBB4_3:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm2
+; X86-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm1
+; X86-NEXT: vandps %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -459,58 +408,39 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximum_combine_cmps:
; SSE2: # %bb.0:
; SSE2-NEXT: divss %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: js .LBB9_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: .LBB9_2:
-; SSE2-NEXT: movaps %xmm3, %xmm2
-; SSE2-NEXT: cmpunordss %xmm3, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB9_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: .LBB9_4:
-; SSE2-NEXT: maxss %xmm1, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm2
-; SSE2-NEXT: orps %xmm4, %xmm2
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm3
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: maxss %xmm1, %xmm4
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: andps %xmm4, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximum_combine_cmps:
; AVX1: # %bb.0:
; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB9_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovaps %xmm0, %xmm2
-; AVX1-NEXT: jmp .LBB9_3
-; AVX1-NEXT: .LBB9_1:
-; AVX1-NEXT: vmovaps %xmm1, %xmm2
-; AVX1-NEXT: vmovaps %xmm0, %xmm1
-; AVX1-NEXT: .LBB9_3:
-; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vandps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fmaximum_combine_cmps:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: testl %eax, %eax
-; AVX512F-NEXT: sets %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovaps %xmm0, %xmm2
-; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; AVX512F-NEXT: vorps %xmm2, %xmm0, %xmm2
+; AVX512F-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX512F-NEXT: vandps %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512F-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512F-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT: vmovaps %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fmaximum_combine_cmps:
@@ -533,20 +463,12 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind {
; X86-LABEL: test_fmaximum_combine_cmps:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB9_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovaps %xmm1, %xmm2
-; X86-NEXT: jmp .LBB9_3
-; X86-NEXT: .LBB9_1:
-; X86-NEXT: vmovaps %xmm0, %xmm2
-; X86-NEXT: vmovaps %xmm1, %xmm0
-; X86-NEXT: .LBB9_3:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vdivss %xmm0, %xmm1, %xmm1
+; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT: vandps %xmm1, %xmm2, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -565,54 +487,34 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind {
define float @test_fminimum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimum:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: js .LBB10_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: .LBB10_2:
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: cmpunordss %xmm3, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB10_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: .LBB10_4:
-; SSE2-NEXT: minss %xmm0, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm2
-; SSE2-NEXT: orps %xmm4, %xmm2
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm3
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: minss %xmm1, %xmm4
+; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimum:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB10_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovdqa %xmm1, %xmm2
-; AVX1-NEXT: jmp .LBB10_3
-; AVX1-NEXT: .LBB10_1:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa %xmm1, %xmm0
-; AVX1-NEXT: .LBB10_3:
-; AVX1-NEXT: vminss %xmm2, %xmm0, %xmm1
+; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vorps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fminimum:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovaps %xmm1, %xmm2
-; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm1
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm2
+; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vorps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, %xmm0
@@ -626,19 +528,10 @@ define float @test_fminimum(float %x, float %y) nounwind {
; X86-LABEL: test_fminimum:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB10_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB10_3
-; X86-NEXT: .LBB10_1:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB10_3:
-; X86-NEXT: vminss %xmm2, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vminss {{[0-9]+}}(%esp), %xmm0, %xmm1
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT: vorps %xmm1, %xmm2, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -724,41 +617,24 @@ define float @test_fminimum_nan1(float %x, float %y) {
define double @test_fminimum_nnan(double %x, double %y) "no-nans-fp-math"="true" nounwind {
; SSE2-LABEL: test_fminimum_nnan:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB14_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: minsd %xmm1, %xmm0
-; SSE2-NEXT: retq
-; SSE2-NEXT: .LBB14_1:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: minsd %xmm2, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: minsd %xmm1, %xmm2
+; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimum_nnan:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: testq %rax, %rax
-; AVX1-NEXT: js .LBB14_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB14_1:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: vminsd %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vorpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fminimum_nnan:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: testq %rax, %rax
-; AVX512F-NEXT: sets %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovapd %xmm1, %xmm2
-; AVX512F-NEXT: vmovsd %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512F-NEXT: vminsd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX512F-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vorpd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fminimum_nnan:
@@ -782,20 +658,11 @@ define double @test_fminimum_nnan(double %x, double %y) "no-nans-fp-math"="true"
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
-; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: vextractps $1, %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB14_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovapd %xmm1, %xmm2
-; X86-NEXT: jmp .LBB14_3
-; X86-NEXT: .LBB14_1:
-; X86-NEXT: vmovapd %xmm0, %xmm2
-; X86-NEXT: vmovapd %xmm1, %xmm0
-; X86-NEXT: .LBB14_3:
-; X86-NEXT: vminsd %xmm2, %xmm0, %xmm0
-; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: vminsd 16(%ebp), %xmm0, %xmm1
+; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
@@ -983,41 +850,25 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimum_combine_cmps:
; SSE2: # %bb.0:
; SSE2-NEXT: divss %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: js .LBB19_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: .LBB19_2:
-; SSE2-NEXT: movaps %xmm3, %xmm2
-; SSE2-NEXT: cmpunordss %xmm3, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB19_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: .LBB19_4:
-; SSE2-NEXT: minss %xmm0, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm2
-; SSE2-NEXT: orps %xmm4, %xmm2
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm3
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: minss %xmm1, %xmm4
+; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimum_combine_cmps:
; AVX1: # %bb.0:
-; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB19_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovaps %xmm2, %xmm1
-; AVX1-NEXT: jmp .LBB19_3
-; AVX1-NEXT: .LBB19_1:
-; AVX1-NEXT: vmovaps %xmm0, %xmm1
-; AVX1-NEXT: vmovaps %xmm2, %xmm0
-; AVX1-NEXT: .LBB19_3:
+; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vorps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
@@ -1025,14 +876,10 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind {
; AVX512F-LABEL: test_fminimum_combine_cmps:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: testl %eax, %eax
-; AVX512F-NEXT: sets %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovaps %xmm1, %xmm2
-; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512F-NEXT: vminss %xmm2, %xmm0, %xmm1
+; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512F-NEXT: vandps %xmm2, %xmm0, %xmm2
+; AVX512F-NEXT: vminss %xmm1, %xmm0, %xmm1
+; AVX512F-NEXT: vorps %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512F-NEXT: vmovaps %xmm1, %xmm0
@@ -1060,18 +907,10 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind {
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vdivss %xmm0, %xmm1, %xmm2
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB19_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovaps %xmm2, %xmm1
-; X86-NEXT: jmp .LBB19_3
-; X86-NEXT: .LBB19_1:
-; X86-NEXT: vmovaps %xmm0, %xmm1
-; X86-NEXT: vmovaps %xmm2, %xmm0
-; X86-NEXT: .LBB19_3:
+; X86-NEXT: vdivss %xmm0, %xmm1, %xmm1
; X86-NEXT: vminss %xmm1, %xmm0, %xmm1
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT: vorps %xmm1, %xmm2, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
@@ -1086,32 +925,24 @@ define float @test_fminimum_combine_cmps(float %x, float %y) nounwind {
define <2 x double> @test_fminimum_vector(<2 x double> %x, <2 x double> %y) {
; SSE2-LABEL: test_fminimum_vector:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: minpd %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: cmpunordpd %xmm3, %xmm0
-; SSE2-NEXT: andpd %xmm0, %xmm3
-; SSE2-NEXT: andnpd %xmm1, %xmm0
-; SSE2-NEXT: orpd %xmm3, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm2
+; SSE2-NEXT: movapd %xmm0, %xmm3
+; SSE2-NEXT: andpd %xmm2, %xmm3
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: minpd %xmm1, %xmm4
+; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: orpd %xmm4, %xmm0
+; SSE2-NEXT: andnpd %xmm0, %xmm2
+; SSE2-NEXT: orpd %xmm3, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimum_vector:
; AVX: # %bb.0:
-; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT: vorpd %xmm1, %xmm2, %xmm1
; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
@@ -1123,9 +954,9 @@ define <2 x double> @test_fminimum_vector(<2 x double> %x, <2 x double> %y) {
;
; X86-LABEL: test_fminimum_vector:
; X86: # %bb.0:
-; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
-; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
-; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1
+; X86-NEXT: vminpd %xmm1, %xmm0, %xmm1
+; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT: vorpd %xmm1, %xmm2, %xmm1
; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: retl
@@ -1250,36 +1081,28 @@ define <2 x double> @test_fminimum_vector_partially_zero(<2 x double> %x) {
define <2 x double> @test_fminimum_vector_different_zeros(<2 x double> %x) {
; SSE2-LABEL: test_fminimum_vector_different_zeros:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: orps %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: minpd %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: cmpunordpd %xmm3, %xmm0
-; SSE2-NEXT: andpd %xmm0, %xmm3
-; SSE2-NEXT: andnpd %xmm1, %xmm0
-; SSE2-NEXT: orpd %xmm3, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm1
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: andpd %xmm1, %xmm2
+; SSE2-NEXT: xorpd %xmm3, %xmm3
+; SSE2-NEXT: movhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: minpd %xmm3, %xmm4
+; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: orpd %xmm4, %xmm0
+; SSE2-NEXT: andnpd %xmm0, %xmm1
+; SSE2-NEXT: orpd %xmm2, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimum_vector_different_zeros:
; AVX: # %bb.0:
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT: vorpd %xmm1, %xmm2, %xmm1
; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
@@ -1295,9 +1118,9 @@ define <2 x double> @test_fminimum_vector_different_zeros(<2 x double> %x) {
; X86: # %bb.0:
; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
-; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
-; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1
+; X86-NEXT: vminpd %xmm1, %xmm0, %xmm1
+; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT: vorpd %xmm1, %xmm2, %xmm1
; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: retl
@@ -1506,31 +1329,25 @@ define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) {
; SSE2-LABEL: test_fmaximum_v4f32_splat:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: andps %xmm2, %xmm4
-; SSE2-NEXT: orps %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: andnps %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: maxps %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: cmpunordps %xmm0, %xmm2
-; SSE2-NEXT: andps %xmm2, %xmm0
-; SSE2-NEXT: andnps %xmm1, %xmm2
-; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: andps %xmm2, %xmm3
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: maxps %xmm1, %xmm4
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: andps %xmm4, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximum_v4f32_splat:
; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmaxps %xmm2, %xmm0, %xmm1
+; AVX1-NEXT: vmaxps %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vandps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
@@ -1538,9 +1355,10 @@ define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) {
; AVX512-LABEL: test_fmaximum_v4f32_splat:
; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX512-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vmaxps %xmm2, %xmm0, %xmm1
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vorps %xmm2, %xmm0, %xmm2
+; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vandps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
; AVX512-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
@@ -1554,9 +1372,9 @@ define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) {
; X86-LABEL: test_fmaximum_v4f32_splat:
; X86: # %bb.0:
; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1
-; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmaxps %xmm2, %xmm0, %xmm1
+; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm1
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
+; X86-NEXT: vandps %xmm1, %xmm2, %xmm1
; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: retl
@@ -1569,134 +1387,108 @@ define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) {
define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; SSE2-LABEL: test_fmaximum_v4f16:
; SSE2: # %bb.0:
-; SSE2-NEXT: subq $104, %rsp
-; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: subq $168, %rsp
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlq $48, %xmm2
+; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrlq $48, %xmm2
+; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
+; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
+; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
-; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: js .LBB33_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: .LBB33_2:
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: callq __extendhfsf2 at PLT
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm3
; SSE2-NEXT: cmpunordss %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: andps %xmm2, %xmm3
-; SSE2-NEXT: js .LBB33_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: .LBB33_4:
-; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: maxss %xmm4, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm0
-; SSE2-NEXT: orps %xmm3, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: andps %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfhf2 at PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
-; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: js .LBB33_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: .LBB33_6:
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andps (%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm3
; SSE2-NEXT: cmpunordss %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: andps %xmm2, %xmm3
-; SSE2-NEXT: js .LBB33_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: .LBB33_8:
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psrlq $48, %xmm1
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: psrlq $48, %xmm1
-; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
-; SSE2-NEXT: maxss %xmm4, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm0
-; SSE2-NEXT: orps %xmm3, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: andps %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfhf2 at PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
-; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: js .LBB33_10
-; SSE2-NEXT: # %bb.9:
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: .LBB33_10:
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: cmpunordss %xmm2, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: andps %xmm2, %xmm3
-; SSE2-NEXT: js .LBB33_12
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: .LBB33_12:
-; SSE2-NEXT: maxss %xmm4, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm1
-; SSE2-NEXT: orps %xmm3, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: cmpunordss %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: andps %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfhf2 at PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movd (%rsp), %xmm4 # 4-byte Folded Reload
-; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: js .LBB33_14
-; SSE2-NEXT: # %bb.13:
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: .LBB33_14:
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm3
; SSE2-NEXT: cmpunordss %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: andps %xmm2, %xmm3
-; SSE2-NEXT: js .LBB33_16
-; SSE2-NEXT: # %bb.15:
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: .LBB33_16:
-; SSE2-NEXT: maxss %xmm4, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm0
-; SSE2-NEXT: orps %xmm3, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: andps %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfhf2 at PLT
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -1704,65 +1496,49 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: addq $104, %rsp
+; SSE2-NEXT: addq $168, %rsp
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximum_v4f16:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $120, %rsp
-; AVX1-NEXT: vmovaps %xmm0, %xmm2
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: subq $152, %rsp
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpsrlq $48, %xmm2, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm0
-; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm0
; AVX1-NEXT: callq __extendhfsf2@PLT
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB33_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: jmp .LBB33_3
-; AVX1-NEXT: .LBB33_1:
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: .LBB33_3:
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vcmpunordss %xmm2, %xmm2, %xmm1
; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: callq __truncsfhf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB33_4
-; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT: jmp .LBB33_6
-; AVX1-NEXT: .LBB33_4:
-; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: .LBB33_6:
-; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vandps (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vcmpunordss %xmm2, %xmm2, %xmm1
; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: callq __truncsfhf2@PLT
@@ -1770,51 +1546,37 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB33_7
-; AVX1-NEXT: # %bb.8:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT: jmp .LBB33_9
-; AVX1-NEXT: .LBB33_7:
-; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: .LBB33_9:
-; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vcmpunordss %xmm2, %xmm2, %xmm1
; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: callq __truncsfhf2@PLT
-; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2@PLT
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB33_10
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: jmp .LBB33_12
-; AVX1-NEXT: .LBB33_10:
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: .LBB33_12:
-; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vcmpunordss %xmm2, %xmm2, %xmm1
; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT: callq __truncsfhf2@PLT
-; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; AVX1-NEXT: addq $120, %rsp
+; AVX1-NEXT: addq $152, %rsp
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximum_v4f16:
@@ -2022,19 +1784,10 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB33_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB33_3
-; X86-NEXT: .LBB33_1:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB33_3:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
+; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
@@ -2044,19 +1797,10 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB33_4
-; X86-NEXT: # %bb.5:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB33_6
-; X86-NEXT: .LBB33_4:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB33_6:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
+; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
@@ -2082,19 +1826,10 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB33_7
-; X86-NEXT: # %bb.8:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB33_9
-; X86-NEXT: .LBB33_7:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB33_9:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
+; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
@@ -2104,19 +1839,10 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB33_10
-; X86-NEXT: # %bb.11:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB33_12
-; X86-NEXT: .LBB33_10:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB33_12:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
+; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
@@ -2146,126 +1872,100 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; SSE2-NEXT: .cfi_def_cfa_offset 24
; SSE2-NEXT: pushq %r14
; SSE2-NEXT: .cfi_def_cfa_offset 32
-; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %r13
; SSE2-NEXT: .cfi_def_cfa_offset 40
+; SSE2-NEXT: pushq %r12
+; SSE2-NEXT: .cfi_def_cfa_offset 48
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: .cfi_def_cfa_offset 56
; SSE2-NEXT: subq $56, %rsp
-; SSE2-NEXT: .cfi_def_cfa_offset 96
-; SSE2-NEXT: .cfi_offset %rbx, -40
+; SSE2-NEXT: .cfi_def_cfa_offset 112
+; SSE2-NEXT: .cfi_offset %rbx, -56
+; SSE2-NEXT: .cfi_offset %r12, -48
+; SSE2-NEXT: .cfi_offset %r13, -40
; SSE2-NEXT: .cfi_offset %r14, -32
; SSE2-NEXT: .cfi_offset %r15, -24
; SSE2-NEXT: .cfi_offset %rbp, -16
-; SSE2-NEXT: pextrw $0, %xmm1, %r14d
-; SSE2-NEXT: pextrw $0, %xmm0, %r15d
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrld $16, %xmm2
-; SSE2-NEXT: pextrw $0, %xmm2, %eax
+; SSE2-NEXT: psrlq $48, %xmm2
+; SSE2-NEXT: pextrw $0, %xmm2, %ebx
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $16, %xmm2
-; SSE2-NEXT: pextrw $0, %xmm2, %ecx
+; SSE2-NEXT: psrlq $48, %xmm2
+; SSE2-NEXT: pextrw $0, %xmm2, %ebp
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
+; SSE2-NEXT: pextrw $0, %xmm2, %r14d
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
+; SSE2-NEXT: pextrw $0, %xmm2, %r15d
+; SSE2-NEXT: pextrw $0, %xmm1, %r12d
+; SSE2-NEXT: pextrw $0, %xmm0, %r13d
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pextrw $0, %xmm1, %eax
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pextrw $0, %xmm0, %ecx
; SSE2-NEXT: shll $16, %ecx
-; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: cmpunordss %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andps %xmm1, %xmm2
; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: testl %ecx, %ecx
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: js .LBB34_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: .LBB34_2:
-; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
-; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm0[1,1]
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: cmpunordss %xmm7, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: andps %xmm7, %xmm4
-; SSE2-NEXT: js .LBB34_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: .LBB34_4:
-; SSE2-NEXT: pextrw $0, %xmm5, %ebp
-; SSE2-NEXT: pextrw $0, %xmm6, %ebx
-; SSE2-NEXT: maxss %xmm2, %xmm7
-; SSE2-NEXT: andnps %xmm7, %xmm0
-; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: maxss %xmm3, %xmm4
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: andps %xmm4, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: shll $16, %r15d
-; SSE2-NEXT: movd %r15d, %xmm3
-; SSE2-NEXT: shll $16, %r14d
-; SSE2-NEXT: movd %r14d, %xmm2
-; SSE2-NEXT: testl %r15d, %r15d
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: js .LBB34_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: .LBB34_6:
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE2-NEXT: psrlq $48, %xmm5
-; SSE2-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT: psrlq $48, %xmm6
+; SSE2-NEXT: shll $16, %r13d
+; SSE2-NEXT: movd %r13d, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: cmpunordss %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: andps %xmm1, %xmm4
-; SSE2-NEXT: js .LBB34_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: .LBB34_8:
-; SSE2-NEXT: pextrw $0, %xmm5, %r15d
-; SSE2-NEXT: pextrw $0, %xmm6, %r14d
-; SSE2-NEXT: maxss %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andps %xmm1, %xmm2
+; SSE2-NEXT: shll $16, %r12d
+; SSE2-NEXT: movd %r12d, %xmm3
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: maxss %xmm3, %xmm4
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: andps %xmm4, %xmm1
; SSE2-NEXT: andnps %xmm1, %xmm0
-; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: shll $16, %ebx
-; SSE2-NEXT: movd %ebx, %xmm1
-; SSE2-NEXT: shll $16, %ebp
-; SSE2-NEXT: movd %ebp, %xmm3
-; SSE2-NEXT: testl %ebx, %ebx
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: js .LBB34_10
-; SSE2-NEXT: # %bb.9:
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: .LBB34_10:
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: cmpunordss %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: andps %xmm2, %xmm4
-; SSE2-NEXT: js .LBB34_12
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: .LBB34_12:
-; SSE2-NEXT: maxss %xmm3, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm0
-; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: shll $16, %r15d
+; SSE2-NEXT: movd %r15d, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: cmpunordss %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andps %xmm1, %xmm2
+; SSE2-NEXT: shll $16, %r14d
+; SSE2-NEXT: movd %r14d, %xmm3
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: maxss %xmm3, %xmm4
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: andps %xmm4, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NEXT: shll $16, %r14d
-; SSE2-NEXT: movd %r14d, %xmm1
-; SSE2-NEXT: shll $16, %r15d
-; SSE2-NEXT: movd %r15d, %xmm3
-; SSE2-NEXT: testl %r14d, %r14d
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: js .LBB34_14
-; SSE2-NEXT: # %bb.13:
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: .LBB34_14:
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: cmpunordss %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: andps %xmm2, %xmm4
-; SSE2-NEXT: js .LBB34_16
-; SSE2-NEXT: # %bb.15:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: .LBB34_16:
-; SSE2-NEXT: maxss %xmm3, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm0
-; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: shll $16, %ebp
+; SSE2-NEXT: movd %ebp, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: cmpunordss %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andps %xmm1, %xmm2
+; SSE2-NEXT: shll $16, %ebx
+; SSE2-NEXT: movd %ebx, %xmm3
+; SSE2-NEXT: movaps %xmm1, %xmm4
+; SSE2-NEXT: maxss %xmm3, %xmm4
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: andps %xmm4, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfbf2@PLT
; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2274,8 +1974,12 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $56, %rsp
-; SSE2-NEXT: .cfi_def_cfa_offset 40
+; SSE2-NEXT: .cfi_def_cfa_offset 56
; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: .cfi_def_cfa_offset 48
+; SSE2-NEXT: popq %r12
+; SSE2-NEXT: .cfi_def_cfa_offset 40
+; SSE2-NEXT: popq %r13
; SSE2-NEXT: .cfi_def_cfa_offset 32
; SSE2-NEXT: popq %r14
; SSE2-NEXT: .cfi_def_cfa_offset 24
@@ -2308,11 +2012,13 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; AVX1-NEXT: .cfi_offset %r15, -24
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm3
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpextrw $0, %xmm4, %ebx
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpextrw $0, %xmm4, %r14d
+; AVX1-NEXT: vpextrw $0, %xmm2, %ebx
+; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm2
+; AVX1-NEXT: vpextrw $0, %xmm2, %ebp
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpextrw $0, %xmm2, %r14d
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpextrw $0, %xmm2, %r15d
; AVX1-NEXT: vpextrw $0, %xmm0, %r12d
; AVX1-NEXT: vpextrw $0, %xmm1, %r13d
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
@@ -2322,71 +2028,45 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: js .LBB34_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovdqa %xmm4, %xmm1
-; AVX1-NEXT: jmp .LBB34_3
-; AVX1-NEXT: .LBB34_1:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa %xmm4, %xmm0
-; AVX1-NEXT: .LBB34_3:
-; AVX1-NEXT: vpextrw $0, %xmm2, %ebp
-; AVX1-NEXT: vpextrw $0, %xmm3, %r15d
-; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vandps %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: callq __truncsfbf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: shll $16, %r13d
; AVX1-NEXT: vmovd %r13d, %xmm0
; AVX1-NEXT: shll $16, %r12d
-; AVX1-NEXT: vmovd %r12d, %xmm2
-; AVX1-NEXT: js .LBB34_4
-; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: vmovdqa %xmm2, %xmm1
-; AVX1-NEXT: jmp .LBB34_6
-; AVX1-NEXT: .LBB34_4:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa %xmm2, %xmm0
-; AVX1-NEXT: .LBB34_6:
-; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovd %r12d, %xmm1
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vandps %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: callq __truncsfbf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: shll $16, %r15d
+; AVX1-NEXT: vmovd %r15d, %xmm0
; AVX1-NEXT: shll $16, %r14d
-; AVX1-NEXT: vmovd %r14d, %xmm0
-; AVX1-NEXT: shll $16, %ebx
-; AVX1-NEXT: vmovd %ebx, %xmm2
-; AVX1-NEXT: js .LBB34_7
-; AVX1-NEXT: # %bb.8:
-; AVX1-NEXT: vmovdqa %xmm2, %xmm1
-; AVX1-NEXT: jmp .LBB34_9
-; AVX1-NEXT: .LBB34_7:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa %xmm2, %xmm0
-; AVX1-NEXT: .LBB34_9:
-; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovd %r14d, %xmm1
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vandps %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: callq __truncsfbf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT: shll $16, %r15d
-; AVX1-NEXT: vmovd %r15d, %xmm0
; AVX1-NEXT: shll $16, %ebp
-; AVX1-NEXT: vmovd %ebp, %xmm2
-; AVX1-NEXT: js .LBB34_10
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: vmovdqa %xmm2, %xmm1
-; AVX1-NEXT: jmp .LBB34_12
-; AVX1-NEXT: .LBB34_10:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa %xmm2, %xmm0
-; AVX1-NEXT: .LBB34_12:
-; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovd %ebp, %xmm0
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovd %ebx, %xmm1
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
+; AVX1-NEXT: vandps %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: callq __truncsfbf2@PLT
; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2444,58 +2124,50 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; AVX512-NEXT: shrq $48, %r12
; AVX512-NEXT: movl %ebp, %eax
; AVX512-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
-; AVX512-NEXT: sets %cl
-; AVX512-NEXT: kmovw %ecx, %k1
-; AVX512-NEXT: movl %r13d, %ecx
-; AVX512-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000
-; AVX512-NEXT: vmovd %ecx, %xmm1
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: movl %r13d, %eax
+; AVX512-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: callq __truncsfbf2@PLT
; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: shll $16, %ebp
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovd %ebp, %xmm1
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: shll $16, %r13d
-; AVX512-NEXT: vmovd %r13d, %xmm1
-; AVX512-NEXT: vmovd %ebp, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vmovd %r13d, %xmm2
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: callq __truncsfbf2@PLT
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp)
; AVX512-NEXT: shll $16, %r12d
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovd %r12d, %xmm1
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: shll $16, %r15d
-; AVX512-NEXT: vmovd %r15d, %xmm1
-; AVX512-NEXT: vmovd %r12d, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vmovd %r15d, %xmm2
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: callq __truncsfbf2@PLT
; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: shll $16, %r14d
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovd %r14d, %xmm1
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: shll $16, %ebx
-; AVX512-NEXT: vmovd %ebx, %xmm1
-; AVX512-NEXT: vmovd %r14d, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vmovd %ebx, %xmm2
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: callq __truncsfbf2@PLT
@@ -2539,90 +2211,67 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: vpsrlq $48, %xmm0, %xmm2
-; X86-NEXT: vpsrlq $48, %xmm1, %xmm3
-; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; X86-NEXT: vpextrw $0, %xmm4, %esi
-; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; X86-NEXT: vpextrw $0, %xmm4, %ebx
+; X86-NEXT: vpextrw $0, %xmm2, %esi
+; X86-NEXT: vpsrlq $48, %xmm1, %xmm2
+; X86-NEXT: vpextrw $0, %xmm2, %edi
+; X86-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT: vpextrw $0, %xmm2, %ebx
+; X86-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X86-NEXT: vpextrw $0, %xmm2, %ebp
; X86-NEXT: vpextrw $0, %xmm0, %eax
-; X86-NEXT: vpextrw $0, %xmm1, %ecx
; X86-NEXT: vpsrld $16, %xmm0, %xmm0
-; X86-NEXT: vpextrw $0, %xmm0, %edx
-; X86-NEXT: vpsrld $16, %xmm1, %xmm0
-; X86-NEXT: vpextrw $0, %xmm0, %edi
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm0
-; X86-NEXT: shll $16, %edx
-; X86-NEXT: vmovd %edx, %xmm4
-; X86-NEXT: js .LBB34_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm4, %xmm1
-; X86-NEXT: jmp .LBB34_3
-; X86-NEXT: .LBB34_1:
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: vmovdqa %xmm4, %xmm0
-; X86-NEXT: .LBB34_3:
-; X86-NEXT: vpextrw $0, %xmm2, %edi
-; X86-NEXT: vpextrw $0, %xmm3, %ebp
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT: vpsrld $16, %xmm1, %xmm2
+; X86-NEXT: vpextrw $0, %xmm2, %ecx
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: vmovd %ecx, %xmm2
+; X86-NEXT: vpextrw $0, %xmm0, %ecx
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: vmovd %ecx, %xmm0
+; X86-NEXT: vpextrw $0, %xmm1, %ecx
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
+; X86-NEXT: vorps %xmm3, %xmm0, %xmm2
+; X86-NEXT: vandps %xmm1, %xmm2, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: shll $16, %ecx
; X86-NEXT: vmovd %ecx, %xmm0
; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm2
-; X86-NEXT: js .LBB34_4
-; X86-NEXT: # %bb.5:
-; X86-NEXT: vmovdqa %xmm2, %xmm1
-; X86-NEXT: jmp .LBB34_6
-; X86-NEXT: .LBB34_4:
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: vmovdqa %xmm2, %xmm0
-; X86-NEXT: .LBB34_6:
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vorps %xmm3, %xmm1, %xmm2
+; X86-NEXT: vandps %xmm0, %xmm2, %xmm0
+; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __truncsfbf2
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: shll $16, %ebp
+; X86-NEXT: vmovd %ebp, %xmm0
; X86-NEXT: shll $16, %ebx
-; X86-NEXT: vmovd %ebx, %xmm0
-; X86-NEXT: shll $16, %esi
-; X86-NEXT: vmovd %esi, %xmm2
-; X86-NEXT: js .LBB34_7
-; X86-NEXT: # %bb.8:
-; X86-NEXT: vmovdqa %xmm2, %xmm1
-; X86-NEXT: jmp .LBB34_9
-; X86-NEXT: .LBB34_7:
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: vmovdqa %xmm2, %xmm0
-; X86-NEXT: .LBB34_9:
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovd %ebx, %xmm1
+; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm2
+; X86-NEXT: vandps %xmm0, %xmm2, %xmm0
+; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __truncsfbf2
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: shll $16, %ebp
-; X86-NEXT: vmovd %ebp, %xmm0
; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm2
-; X86-NEXT: js .LBB34_10
-; X86-NEXT: # %bb.11:
-; X86-NEXT: vmovdqa %xmm2, %xmm1
-; X86-NEXT: jmp .LBB34_12
-; X86-NEXT: .LBB34_10:
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: vmovdqa %xmm2, %xmm0
-; X86-NEXT: .LBB34_12:
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovd %edi, %xmm0
+; X86-NEXT: shll $16, %esi
+; X86-NEXT: vmovd %esi, %xmm1
+; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm2
+; X86-NEXT: vandps %xmm0, %xmm2, %xmm0
+; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __truncsfbf2
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index aae6cda4458d2..82aa61211876d 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -22,55 +22,36 @@ declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
define float @test_fmaximumnum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximumnum:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: js .LBB0_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: jmp .LBB0_3
-; SSE2-NEXT: .LBB0_1:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: .LBB0_3:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: maxss %xmm2, %xmm3
-; SSE2-NEXT: movaps %xmm3, %xmm0
-; SSE2-NEXT: cmpunordss %xmm3, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: andnps %xmm3, %xmm2
-; SSE2-NEXT: andps %xmm1, %xmm0
-; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: maxss %xmm1, %xmm3
+; SSE2-NEXT: cmpunordss %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: andps %xmm3, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm1
+; SSE2-NEXT: orps %xmm1, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB0_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: jmp .LBB0_3
-; AVX1-NEXT: .LBB0_1:
-; AVX1-NEXT: vmovdqa %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: .LBB0_3:
-; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX1-NEXT: vandps %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximumnum:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vorps %xmm2, %xmm0, %xmm2
+; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm3
+; AVX512-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum:
@@ -81,21 +62,13 @@ define float @test_fmaximumnum(float %x, float %y) nounwind {
; X86-LABEL: test_fmaximumnum:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm2, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB0_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm2, %xmm1
-; X86-NEXT: jmp .LBB0_3
-; X86-NEXT: .LBB0_1:
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: vmovdqa %xmm2, %xmm0
-; X86-NEXT: .LBB0_3:
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm3
+; X86-NEXT: vandps %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -179,45 +152,30 @@ define float @test_fmaximumnum_nnan(float %x, float %y) nounwind {
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: addss %xmm1, %xmm2
; SSE2-NEXT: subss %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: js .LBB4_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: maxss %xmm2, %xmm0
-; SSE2-NEXT: retq
-; SSE2-NEXT: .LBB4_1:
-; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: andps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: maxss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_nnan:
; AVX1: # %bb.0:
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm2, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB4_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmaxss %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB4_1:
-; AVX1-NEXT: vmovaps %xmm0, %xmm1
-; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
+; AVX1-NEXT: vandps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fmaximumnum_nnan:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: testl %eax, %eax
-; AVX512F-NEXT: sets %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovaps %xmm2, %xmm1
-; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512F-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
-; AVX512F-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
+; AVX512F-NEXT: vorps %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vandps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fmaximumnum_nnan:
@@ -243,20 +201,12 @@ define float @test_fmaximumnum_nnan(float %x, float %y) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-NEXT: vaddss %xmm0, %xmm2, %xmm1
-; X86-NEXT: vsubss %xmm0, %xmm2, %xmm0
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB4_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovaps %xmm1, %xmm2
-; X86-NEXT: jmp .LBB4_3
-; X86-NEXT: .LBB4_1:
-; X86-NEXT: vmovaps %xmm0, %xmm2
-; X86-NEXT: vmovaps %xmm1, %xmm0
-; X86-NEXT: .LBB4_3:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vaddss %xmm0, %xmm1, %xmm2
+; X86-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmaxss %xmm0, %xmm2, %xmm0
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2, %xmm1
+; X86-NEXT: vandps %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -369,28 +319,29 @@ define double @test_fmaximumnum_zero2(double %x, double %y) {
define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="true" nounwind {
; SSE2-LABEL: test_fmaximumnum_nsz:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: maxss %xmm1, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm1
-; SSE2-NEXT: cmpunordss %xmm2, %xmm1
-; SSE2-NEXT: andps %xmm1, %xmm0
-; SSE2-NEXT: andnps %xmm2, %xmm1
-; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: cmpunordss %xmm1, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm3
+; SSE2-NEXT: maxss %xmm1, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_nsz:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximumnum_nsz:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_nsz:
@@ -402,9 +353,10 @@ define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -417,57 +369,27 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_combine_cmps:
; SSE2: # %bb.0:
; SSE2-NEXT: divss %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: js .LBB9_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: jmp .LBB9_3
-; SSE2-NEXT: .LBB9_1:
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: .LBB9_3:
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: maxss %xmm2, %xmm3
-; SSE2-NEXT: movaps %xmm3, %xmm0
-; SSE2-NEXT: cmpunordss %xmm3, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: andnps %xmm3, %xmm2
-; SSE2-NEXT: andps %xmm1, %xmm0
-; SSE2-NEXT: orps %xmm2, %xmm0
+; SSE2-NEXT: maxss %xmm1, %xmm2
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_combine_cmps:
; AVX1: # %bb.0:
; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB9_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovaps %xmm0, %xmm2
-; AVX1-NEXT: jmp .LBB9_3
-; AVX1-NEXT: .LBB9_1:
-; AVX1-NEXT: vmovaps %xmm1, %xmm2
-; AVX1-NEXT: vmovaps %xmm0, %xmm1
-; AVX1-NEXT: .LBB9_3:
-; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fmaximumnum_combine_cmps:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: testl %eax, %eax
-; AVX512F-NEXT: sets %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovaps %xmm0, %xmm2
-; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512F-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; AVX512F-NEXT: vorps %xmm2, %xmm0, %xmm2
+; AVX512F-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vandps %xmm0, %xmm2, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fmaximumnum_combine_cmps:
@@ -490,22 +412,12 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
; X86-LABEL: test_fmaximumnum_combine_cmps:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB9_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovaps %xmm1, %xmm2
-; X86-NEXT: jmp .LBB9_3
-; X86-NEXT: .LBB9_1:
-; X86-NEXT: vmovaps %xmm0, %xmm2
-; X86-NEXT: vmovaps %xmm1, %xmm0
-; X86-NEXT: .LBB9_3:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vdivss %xmm0, %xmm1, %xmm1
+; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vandps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -522,57 +434,36 @@ define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
define float @test_fminimumnum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimumnum:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: js .LBB10_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: jmp .LBB10_3
-; SSE2-NEXT: .LBB10_1:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: .LBB10_3:
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: minss %xmm2, %xmm3
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: cmpunordss %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: minss %xmm1, %xmm3
+; SSE2-NEXT: cmpunordss %xmm1, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: andnps %xmm3, %xmm2
-; SSE2-NEXT: andps %xmm0, %xmm1
-; SSE2-NEXT: orps %xmm2, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm0, %xmm2
+; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: orps %xmm3, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm1
+; SSE2-NEXT: orps %xmm1, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB10_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovdqa %xmm1, %xmm2
-; AVX1-NEXT: jmp .LBB10_3
-; AVX1-NEXT: .LBB10_1:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa %xmm1, %xmm0
-; AVX1-NEXT: .LBB10_3:
-; AVX1-NEXT: vminss %xmm2, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX1-NEXT: vorps %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fminimumnum:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovaps %xmm1, %xmm2
-; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm1
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm2
+; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm3
+; AVX512-NEXT: vorps %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum:
@@ -583,21 +474,13 @@ define float @test_fminimumnum(float %x, float %y) nounwind {
; X86-LABEL: test_fminimumnum:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB10_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB10_3
-; X86-NEXT: .LBB10_1:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB10_3:
-; X86-NEXT: vminss %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vminss %xmm0, %xmm1, %xmm2
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm3
+; X86-NEXT: vorps %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -678,41 +561,24 @@ define float @test_fminimumnum_nan1(float %x, float %y) {
define double @test_fminimumnum_nnan(double %x, double %y) "no-nans-fp-math"="true" nounwind {
; SSE2-LABEL: test_fminimumnum_nnan:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB14_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: minsd %xmm1, %xmm0
-; SSE2-NEXT: retq
-; SSE2-NEXT: .LBB14_1:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: minsd %xmm2, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: minsd %xmm1, %xmm2
+; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum_nnan:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: testq %rax, %rax
-; AVX1-NEXT: js .LBB14_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB14_1:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: vminsd %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vorpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fminimumnum_nnan:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: testq %rax, %rax
-; AVX512F-NEXT: sets %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovapd %xmm1, %xmm2
-; AVX512F-NEXT: vmovsd %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512F-NEXT: vminsd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vminsd %xmm1, %xmm0, %xmm1
+; AVX512F-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vorpd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fminimumnum_nnan:
@@ -736,20 +602,11 @@ define double @test_fminimumnum_nnan(double %x, double %y) "no-nans-fp-math"="tr
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
-; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: vextractps $1, %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB14_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovapd %xmm1, %xmm2
-; X86-NEXT: jmp .LBB14_3
-; X86-NEXT: .LBB14_1:
-; X86-NEXT: vmovapd %xmm0, %xmm2
-; X86-NEXT: vmovapd %xmm1, %xmm0
-; X86-NEXT: .LBB14_3:
-; X86-NEXT: vminsd %xmm2, %xmm0, %xmm0
-; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: vminsd 16(%ebp), %xmm0, %xmm1
+; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vorpd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
@@ -853,28 +710,29 @@ define double @test_fminimumnum_zero2(double %x, double %y) {
define float @test_fminimumnum_nsz(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimumnum_nsz:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: minss %xmm1, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm1
-; SSE2-NEXT: cmpunordss %xmm2, %xmm1
-; SSE2-NEXT: andps %xmm1, %xmm0
-; SSE2-NEXT: andnps %xmm2, %xmm1
-; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: cmpunordss %xmm1, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm3
+; SSE2-NEXT: minss %xmm1, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum_nsz:
; AVX1: # %bb.0:
-; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fminimumnum_nsz:
; AVX512: # %bb.0:
-; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_nsz:
@@ -886,9 +744,10 @@ define float @test_fminimumnum_nsz(float %x, float %y) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vminss {{[0-9]+}}(%esp), %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: vminss %xmm0, %xmm1, %xmm2
+; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -901,59 +760,27 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimumnum_combine_cmps:
; SSE2: # %bb.0:
; SSE2-NEXT: divss %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: js .LBB19_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: jmp .LBB19_3
-; SSE2-NEXT: .LBB19_1:
; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: .LBB19_3:
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: minss %xmm2, %xmm3
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: cmpunordss %xmm3, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: andnps %xmm3, %xmm2
-; SSE2-NEXT: andps %xmm0, %xmm1
-; SSE2-NEXT: orps %xmm2, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: minss %xmm1, %xmm2
+; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum_combine_cmps:
; AVX1: # %bb.0:
-; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB19_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovaps %xmm2, %xmm1
-; AVX1-NEXT: jmp .LBB19_3
-; AVX1-NEXT: .LBB19_1:
-; AVX1-NEXT: vmovaps %xmm0, %xmm1
-; AVX1-NEXT: vmovaps %xmm2, %xmm0
-; AVX1-NEXT: .LBB19_3:
+; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fminimumnum_combine_cmps:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: testl %eax, %eax
-; AVX512F-NEXT: sets %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovaps %xmm1, %xmm2
-; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512F-NEXT: vminss %xmm2, %xmm0, %xmm1
-; AVX512F-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512F-NEXT: vmovaps %xmm1, %xmm0
+; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512F-NEXT: vandps %xmm2, %xmm0, %xmm2
+; AVX512F-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vorps %xmm0, %xmm2, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fminimumnum_combine_cmps:
@@ -978,20 +805,10 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vdivss %xmm0, %xmm1, %xmm2
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB19_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovaps %xmm2, %xmm1
-; X86-NEXT: jmp .LBB19_3
-; X86-NEXT: .LBB19_1:
-; X86-NEXT: vmovaps %xmm0, %xmm1
-; X86-NEXT: vmovaps %xmm2, %xmm0
-; X86-NEXT: .LBB19_3:
+; X86-NEXT: vdivss %xmm0, %xmm1, %xmm1
; X86-NEXT: vminss %xmm1, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
@@ -1004,34 +821,25 @@ define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) {
; SSE2-LABEL: test_fminimumnum_vector:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: minpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: cmpunordpd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: andnpd %xmm1, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm3
+; SSE2-NEXT: minpd %xmm1, %xmm3
+; SSE2-NEXT: cmpunordpd %xmm1, %xmm1
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: andpd %xmm1, %xmm2
+; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: orpd %xmm3, %xmm0
+; SSE2-NEXT: andnpd %xmm0, %xmm1
+; SSE2-NEXT: orpd %xmm1, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_vector:
; AVX: # %bb.0:
-; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX-NEXT: vorpd %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_vector:
@@ -1041,11 +849,11 @@ define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) {
;
; X86-LABEL: test_fminimumnum_vector:
; X86: # %bb.0:
-; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
-; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
-; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vminpd %xmm1, %xmm0, %xmm2
+; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm3
+; X86-NEXT: vorpd %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT: retl
%r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y)
ret <2 x double> %r
@@ -1080,17 +888,16 @@ define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) {
; SSE2: # %bb.0:
; SSE2-NEXT: xorpd %xmm1, %xmm1
; SSE2-NEXT: minpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: cmpunordpd %xmm1, %xmm0
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
; SSE2-NEXT: andnpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_vector_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vandnpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_vector_zero:
@@ -1102,9 +909,9 @@ define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) {
; X86-LABEL: test_fminimumnum_vector_zero:
; X86: # %bb.0:
; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0
-; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1
-; X86-NEXT: vandnpd %xmm0, %xmm1, %xmm0
+; X86-NEXT: vminpd %xmm0, %xmm1, %xmm1
+; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vandnpd %xmm1, %xmm0, %xmm0
; X86-NEXT: retl
%r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0.>)
ret <2 x double> %r
@@ -1116,8 +923,7 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) {
; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: maxps %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: cmpunordps %xmm2, %xmm0
+; SSE2-NEXT: cmpunordps %xmm0, %xmm0
; SSE2-NEXT: andps %xmm0, %xmm1
; SSE2-NEXT: andnps %xmm2, %xmm0
; SSE2-NEXT: orps %xmm1, %xmm0
@@ -1126,9 +932,9 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) {
; AVX-LABEL: test_fmaximumnum_vector_signed_zero:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero:
@@ -1139,9 +945,9 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) {
; X86-LABEL: test_fmaximumnum_vector_signed_zero:
; X86: # %bb.0:
; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm2
+; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: retl
%r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>)
ret <4 x float> %r
@@ -1153,21 +959,20 @@ define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) {
; SSE2-NEXT: xorpd %xmm1, %xmm1
; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movapd %xmm1, %xmm2
-; SSE2-NEXT: minpd %xmm0, %xmm2
-; SSE2-NEXT: movapd %xmm2, %xmm0
-; SSE2-NEXT: cmpunordpd %xmm2, %xmm0
-; SSE2-NEXT: andpd %xmm0, %xmm1
-; SSE2-NEXT: andnpd %xmm2, %xmm0
-; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: minpd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE2-NEXT: andpd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm1, %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_vector_partially_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_vector_partially_zero:
@@ -1181,9 +986,9 @@ define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) {
; X86: # %bb.0:
; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0
-; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
-; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; X86-NEXT: vminpd %xmm0, %xmm1, %xmm2
+; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: retl
%r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 5.>)
ret <2 x double> %r
@@ -1192,38 +997,21 @@ define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) {
define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) {
; SSE2-LABEL: test_fminimumnum_vector_different_zeros:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: orps %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: minpd %xmm4, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: cmpunordpd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: andnpd %xmm1, %xmm0
-; SSE2-NEXT: orpd %xmm3, %xmm0
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: minpd %xmm1, %xmm2
+; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_vector_different_zeros:
; AVX: # %bb.0:
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_vector_different_zeros:
@@ -1237,11 +1025,9 @@ define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) {
; X86: # %bb.0:
; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
-; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
-; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vminpd %xmm1, %xmm0, %xmm1
+; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vorpd %xmm1, %xmm0, %xmm0
; X86-NEXT: retl
%r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double -0.>)
ret <2 x double> %r
@@ -1277,21 +1063,20 @@ define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) {
; SSE2-NEXT: xorpd %xmm1, %xmm1
; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; SSE2-NEXT: movapd %xmm1, %xmm2
-; SSE2-NEXT: minpd %xmm0, %xmm2
-; SSE2-NEXT: movapd %xmm2, %xmm0
-; SSE2-NEXT: cmpunordpd %xmm2, %xmm0
-; SSE2-NEXT: andpd %xmm0, %xmm1
-; SSE2-NEXT: andnpd %xmm2, %xmm0
-; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: minpd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
+; SSE2-NEXT: andpd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm1, %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_vector_nan:
; AVX: # %bb.0:
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_vector_nan:
@@ -1305,9 +1090,9 @@ define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) {
; X86: # %bb.0:
; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0
-; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
-; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; X86-NEXT: vminpd %xmm0, %xmm1, %xmm2
+; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: retl
%r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0x7fff000000000000>)
ret <2 x double> %r
@@ -1318,17 +1103,16 @@ define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) {
; SSE2: # %bb.0:
; SSE2-NEXT: xorpd %xmm1, %xmm1
; SSE2-NEXT: minpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: cmpunordpd %xmm1, %xmm0
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
; SSE2-NEXT: andnpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_vector_zero_first:
; AVX: # %bb.0:
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vandnpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vandnpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_vector_zero_first:
@@ -1340,9 +1124,9 @@ define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) {
; X86-LABEL: test_fminimumnum_vector_zero_first:
; X86: # %bb.0:
; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0
-; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1
-; X86-NEXT: vandnpd %xmm0, %xmm1, %xmm0
+; X86-NEXT: vminpd %xmm0, %xmm1, %xmm1
+; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vandnpd %xmm1, %xmm0, %xmm0
; X86-NEXT: retl
%r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> <double 0., double 0.>, <2 x double> %x)
ret <2 x double> %r
@@ -1378,8 +1162,7 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) {
; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: maxps %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: cmpunordps %xmm2, %xmm0
+; SSE2-NEXT: cmpunordps %xmm0, %xmm0
; SSE2-NEXT: andps %xmm0, %xmm1
; SSE2-NEXT: andnps %xmm2, %xmm0
; SSE2-NEXT: orps %xmm1, %xmm0
@@ -1388,9 +1171,9 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) {
; AVX-LABEL: test_fmaximumnum_vector_signed_zero_first:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero_first:
@@ -1401,9 +1184,9 @@ define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) {
; X86-LABEL: test_fmaximumnum_vector_signed_zero_first:
; X86: # %bb.0:
; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0
-; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm2
+; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: retl
%r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x)
ret <4 x float> %r
@@ -1442,43 +1225,37 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
; SSE2-LABEL: test_fmaximumnum_v4f32_splat:
; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: andps %xmm2, %xmm4
-; SSE2-NEXT: orps %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: andnps %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: maxps %xmm4, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: cmpunordps %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: andnps %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: maxps %xmm1, %xmm3
+; SSE2-NEXT: cmpunordps %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andps %xmm1, %xmm2
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: andps %xmm3, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm1
+; SSE2-NEXT: orps %xmm1, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_v4f32_splat:
; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmaxps %xmm2, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordps %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmaxps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX1-NEXT: vandps %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vcmpunordps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximumnum_v4f32_splat:
; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX512-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vmaxps %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vcmpunordps %xmm1, %xmm1, %xmm2
-; AVX512-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vorps %xmm2, %xmm0, %xmm2
+; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm3
+; AVX512-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vcmpunordps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_v4f32_splat:
@@ -1490,11 +1267,11 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
; X86-LABEL: test_fmaximumnum_v4f32_splat:
; X86: # %bb.0:
; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1
-; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; X86-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmaxps %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpunordps %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm2
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm3
+; X86-NEXT: vandps %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcmpunordps %xmm1, %xmm1, %xmm1
+; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT: retl
%splatinsert = insertelement <4 x float> poison, float %y, i64 0
%vec = shufflevector <4 x float> %splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
@@ -1505,130 +1282,100 @@ define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_v4f16:
; SSE2: # %bb.0:
-; SSE2-NEXT: subq $136, %rsp
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
-; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: subq $168, %rsp
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlq $48, %xmm2
+; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrlq $48, %xmm2
+; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
+; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: callq __extendhfsf2 at PLT
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
+; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: js .LBB33_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE2-NEXT: jmp .LBB33_3
-; SSE2-NEXT: .LBB33_1:
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: .LBB33_3:
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: psrlq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: psrlq $48, %xmm0
-; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: maxss %xmm1, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: cmpunordss %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: cmpunordss %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: andps %xmm3, %xmm0
-; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfhf2 at PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __extendhfsf2 at PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
-; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: js .LBB33_4
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE2-NEXT: jmp .LBB33_6
-; SSE2-NEXT: .LBB33_4:
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: .LBB33_6:
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: maxss %xmm1, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: cmpunordss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: callq __extendhfsf2 at PLT
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andps (%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: cmpunordss %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: andps %xmm3, %xmm0
-; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfhf2 at PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: js .LBB33_7
-; SSE2-NEXT: # %bb.8:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; SSE2-NEXT: jmp .LBB33_9
-; SSE2-NEXT: .LBB33_7:
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: .LBB33_9:
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: maxss %xmm1, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: cmpunordss %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: callq __extendhfsf2 at PLT
+; SSE2-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: cmpunordss %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm2
; SSE2-NEXT: andps %xmm3, %xmm0
-; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfhf2 at PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: callq __extendhfsf2 at PLT
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: js .LBB33_10
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT: jmp .LBB33_12
-; SSE2-NEXT: .LBB33_10:
-; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: .LBB33_12:
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: maxss %xmm1, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: cmpunordss %xmm2, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: andnps %xmm2, %xmm1
-; SSE2-NEXT: andps %xmm3, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT: orps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: callq __extendhfsf2 at PLT
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: cmpunordss %xmm0, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: andps %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm0
; SSE2-NEXT: callq __truncsfhf2 at PLT
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -1636,246 +1383,177 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: addq $136, %rsp
+; SSE2-NEXT: addq $168, %rsp
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_v4f16:
; AVX1: # %bb.0:
-; AVX1-NEXT: subq $120, %rsp
-; AVX1-NEXT: vmovaps %xmm0, %xmm2
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: subq $152, %rsp
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqa %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpsrlq $48, %xmm2, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm0
-; AVX1-NEXT: callq __extendhfsf2 at PLT
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm0
; AVX1-NEXT: callq __extendhfsf2 at PLT
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB33_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: jmp .LBB33_3
-; AVX1-NEXT: .LBB33_1:
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: .LBB33_3:
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __extendhfsf2 at PLT
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vmaxss %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: callq __truncsfhf2 at PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2 at PLT
-; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2 at PLT
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB33_4
-; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT: jmp .LBB33_6
-; AVX1-NEXT: .LBB33_4:
-; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: .LBB33_6:
-; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vmaxss %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vandps (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: callq __truncsfhf2 at PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2 at PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2 at PLT
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB33_7
-; AVX1-NEXT: # %bb.8:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT: jmp .LBB33_9
-; AVX1-NEXT: .LBB33_7:
-; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: .LBB33_9:
-; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vmaxss %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: callq __truncsfhf2 at PLT
-; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2 at PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __extendhfsf2 at PLT
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: testl %eax, %eax
-; AVX1-NEXT: js .LBB33_10
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX1-NEXT: jmp .LBB33_12
-; AVX1-NEXT: .LBB33_10:
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vmovdqa %xmm0, %xmm2
-; AVX1-NEXT: .LBB33_12:
-; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
-; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1
-; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vmaxss %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: callq __truncsfhf2 at PLT
-; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; AVX1-NEXT: addq $120, %rsp
+; AVX1-NEXT: addq $152, %rsp
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximumnum_v4f16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm2, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vmovdqa %xmm2, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm3
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vorps %xmm2, %xmm3, %xmm4
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: vmaxss %xmm5, %xmm3, %xmm6
+; AVX512-NEXT: vandps %xmm6, %xmm4, %xmm4
+; AVX512-NEXT: vcmpunordss %xmm5, %xmm5, %k1
; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
-; AVX512-NEXT: vmaxss %xmm4, %xmm3, %xmm2
-; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1
-; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vmovd %xmm3, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm3
+; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3]
; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512-NEXT: vmovdqa %xmm3, %xmm5
+; AVX512-NEXT: vorps %xmm2, %xmm4, %xmm5
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: vmaxss %xmm6, %xmm4, %xmm7
+; AVX512-NEXT: vandps %xmm7, %xmm5, %xmm5
+; AVX512-NEXT: vcmpunordss %xmm6, %xmm6, %k1
; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1}
-; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3
-; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1
-; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vmovd %xmm3, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm4
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512-NEXT: vmovdqa %xmm3, %xmm5
+; AVX512-NEXT: vorps %xmm2, %xmm4, %xmm5
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: vmaxss %xmm6, %xmm4, %xmm7
+; AVX512-NEXT: vandps %xmm7, %xmm5, %xmm5
+; AVX512-NEXT: vcmpunordss %xmm6, %xmm6, %k1
; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1}
-; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3
-; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1
-; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512-NEXT: vmovd %xmm4, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm4
+; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: vmovdqa %xmm4, %xmm6
+; AVX512-NEXT: vorps %xmm2, %xmm5, %xmm6
+; AVX512-NEXT: vshufpd {{.*#+}} xmm7 = xmm1[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vmaxss %xmm7, %xmm5, %xmm8
+; AVX512-NEXT: vandps %xmm6, %xmm8, %xmm6
+; AVX512-NEXT: vcmpunordss %xmm7, %xmm7, %k1
; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1}
-; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1}
-; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4
-; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1
-; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm3
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vmovd %xmm3, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrlq $48, %xmm1, %xmm4
+; AVX512-NEXT: vcvtps2ph $4, %xmm6, %xmm5
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm4
; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512-NEXT: vmovdqa %xmm3, %xmm5
+; AVX512-NEXT: vorps %xmm2, %xmm4, %xmm5
+; AVX512-NEXT: vpsrlq $48, %xmm1, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: vmaxss %xmm6, %xmm4, %xmm7
+; AVX512-NEXT: vandps %xmm7, %xmm5, %xmm5
+; AVX512-NEXT: vcmpunordss %xmm6, %xmm6, %k1
; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1}
-; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3
-; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1
-; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512-NEXT: vmovd %xmm4, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: vmovdqa %xmm4, %xmm6
+; AVX512-NEXT: vorps %xmm2, %xmm5, %xmm6
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vmaxss %xmm7, %xmm5, %xmm8
+; AVX512-NEXT: vandps %xmm6, %xmm8, %xmm6
+; AVX512-NEXT: vcmpunordss %xmm7, %xmm7, %k1
; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1}
-; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1}
-; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4
-; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1
-; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm4
-; AVX512-NEXT: vmovd %xmm4, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm5
-; AVX512-NEXT: vmovdqa %xmm4, %xmm6
+; AVX512-NEXT: vcvtps2ph $4, %xmm6, %xmm5
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm5
+; AVX512-NEXT: vorps %xmm2, %xmm5, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm7
+; AVX512-NEXT: vmaxss %xmm7, %xmm5, %xmm8
+; AVX512-NEXT: vandps %xmm6, %xmm8, %xmm6
+; AVX512-NEXT: vcmpunordss %xmm7, %xmm7, %k1
; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1}
-; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1}
-; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4
-; AVX512-NEXT: vcmpunordss %xmm4, %xmm4, %k1
-; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: vcvtps2ph $4, %xmm6, %xmm5
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vorps %xmm2, %xmm0, %xmm2
; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa %xmm0, %xmm5
-; AVX512-NEXT: vmovss %xmm1, %xmm5, %xmm5 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm5, %xmm1, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm6
+; AVX512-NEXT: vandps %xmm6, %xmm2, %xmm2
+; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
+; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_v4f16:
@@ -1915,20 +1593,12 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB33_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB33_3
-; X86-NEXT: .LBB33_1:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB33_3:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm3
+; X86-NEXT: vandps %xmm3, %xmm1, %xmm1
+; X86-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __extendhfsf2
@@ -1937,20 +1607,12 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB33_4
-; X86-NEXT: # %bb.5:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB33_6
-; X86-NEXT: .LBB33_4:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB33_6:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm3
+; X86-NEXT: vandps %xmm3, %xmm1, %xmm1
+; X86-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __truncsfhf2
@@ -1975,20 +1637,12 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB33_7
-; X86-NEXT: # %bb.8:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB33_9
-; X86-NEXT: .LBB33_7:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB33_9:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm3
+; X86-NEXT: vandps %xmm3, %xmm1, %xmm1
+; X86-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __extendhfsf2
@@ -1997,20 +1651,12 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: vmovd %xmm1, %eax
-; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: js .LBB33_10
-; X86-NEXT: # %bb.11:
-; X86-NEXT: vmovdqa %xmm1, %xmm2
-; X86-NEXT: jmp .LBB33_12
-; X86-NEXT: .LBB33_10:
-; X86-NEXT: vmovdqa %xmm0, %xmm2
-; X86-NEXT: vmovdqa %xmm1, %xmm0
-; X86-NEXT: .LBB33_12:
-; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm3
+; X86-NEXT: vandps %xmm3, %xmm1, %xmm1
+; X86-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __truncsfhf2
@@ -2040,109 +1686,81 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: subq $56, %rsp
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlq $48, %xmm2
+; SSE2-NEXT: pextrw $0, %xmm2, %ebx
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlq $48, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrlq $48, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
-; SSE2-NEXT: pextrw $0, %xmm4, %ebp
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
-; SSE2-NEXT: pextrw $0, %xmm4, %r15d
-; SSE2-NEXT: pextrw $0, %xmm0, %r12d
-; SSE2-NEXT: pextrw $0, %xmm1, %r13d
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
+; SSE2-NEXT: pextrw $0, %xmm2, %ebp
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
+; SSE2-NEXT: pextrw $0, %xmm2, %r14d
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
+; SSE2-NEXT: pextrw $0, %xmm2, %r15d
+; SSE2-NEXT: pextrw $0, %xmm1, %r12d
+; SSE2-NEXT: pextrw $0, %xmm0, %r13d
; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pextrw $0, %xmm1, %ecx
+; SSE2-NEXT: pextrw $0, %xmm1, %eax
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: pextrw $0, %xmm0, %ecx
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: movd %eax, %xmm4
-; SSE2-NEXT: js .LBB34_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: jmp .LBB34_3
-; SSE2-NEXT: .LBB34_1:
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: .LBB34_3:
-; SSE2-NEXT: pextrw $0, %xmm2, %ebx
-; SSE2-NEXT: pextrw $0, %xmm3, %r14d
+; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: maxss %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: cmpunordss %xmm2, %xmm0
+; SSE2-NEXT: cmpunordss %xmm0, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: andnps %xmm2, %xmm3
-; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm0
; SSE2-NEXT: orps %xmm3, %xmm0
; SSE2-NEXT: callq __truncsfbf2 at PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: shll $16, %r13d
; SSE2-NEXT: movd %r13d, %xmm1
; SSE2-NEXT: shll $16, %r12d
-; SSE2-NEXT: movd %r12d, %xmm2
-; SSE2-NEXT: js .LBB34_4
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: jmp .LBB34_6
-; SSE2-NEXT: .LBB34_4:
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: .LBB34_6:
+; SSE2-NEXT: movd %r12d, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: maxss %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: cmpunordss %xmm2, %xmm0
+; SSE2-NEXT: cmpunordss %xmm0, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: andnps %xmm2, %xmm3
-; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm0
; SSE2-NEXT: orps %xmm3, %xmm0
; SSE2-NEXT: callq __truncsfbf2 at PLT
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: shll $16, %r15d
; SSE2-NEXT: movd %r15d, %xmm1
-; SSE2-NEXT: shll $16, %ebp
-; SSE2-NEXT: movd %ebp, %xmm2
-; SSE2-NEXT: js .LBB34_7
-; SSE2-NEXT: # %bb.8:
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: jmp .LBB34_9
-; SSE2-NEXT: .LBB34_7:
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: .LBB34_9:
+; SSE2-NEXT: shll $16, %r14d
+; SSE2-NEXT: movd %r14d, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: maxss %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: cmpunordss %xmm2, %xmm0
+; SSE2-NEXT: cmpunordss %xmm0, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: andnps %xmm2, %xmm3
-; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm0
; SSE2-NEXT: orps %xmm3, %xmm0
; SSE2-NEXT: callq __truncsfbf2 at PLT
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NEXT: shll $16, %r14d
-; SSE2-NEXT: movd %r14d, %xmm1
+; SSE2-NEXT: shll $16, %ebp
+; SSE2-NEXT: movd %ebp, %xmm1
; SSE2-NEXT: shll $16, %ebx
-; SSE2-NEXT: movd %ebx, %xmm2
-; SSE2-NEXT: js .LBB34_10
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: jmp .LBB34_12
-; SSE2-NEXT: .LBB34_10:
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: .LBB34_12:
+; SSE2-NEXT: movd %ebx, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: maxss %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: cmpunordss %xmm2, %xmm0
+; SSE2-NEXT: cmpunordss %xmm0, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: andnps %xmm2, %xmm3
-; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm0
; SSE2-NEXT: orps %xmm3, %xmm0
; SSE2-NEXT: callq __truncsfbf2 at PLT
; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
@@ -2170,11 +1788,13 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $56, %rsp
; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm3
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpextrw $0, %xmm4, %ebx
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpextrw $0, %xmm4, %r14d
+; AVX1-NEXT: vpextrw $0, %xmm2, %ebx
+; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm2
+; AVX1-NEXT: vpextrw $0, %xmm2, %ebp
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpextrw $0, %xmm2, %r14d
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpextrw $0, %xmm2, %r15d
; AVX1-NEXT: vpextrw $0, %xmm0, %r12d
; AVX1-NEXT: vpextrw $0, %xmm1, %r13d
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
@@ -2184,71 +1804,45 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
; AVX1-NEXT: shll $16, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: js .LBB34_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vmovdqa %xmm4, %xmm1
-; AVX1-NEXT: jmp .LBB34_3
-; AVX1-NEXT: .LBB34_1:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa %xmm4, %xmm0
-; AVX1-NEXT: .LBB34_3:
-; AVX1-NEXT: vpextrw $0, %xmm2, %ebp
-; AVX1-NEXT: vpextrw $0, %xmm3, %r15d
-; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vandps %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: callq __truncsfbf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: shll $16, %r13d
; AVX1-NEXT: vmovd %r13d, %xmm0
; AVX1-NEXT: shll $16, %r12d
-; AVX1-NEXT: vmovd %r12d, %xmm2
-; AVX1-NEXT: js .LBB34_4
-; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: vmovdqa %xmm2, %xmm1
-; AVX1-NEXT: jmp .LBB34_6
-; AVX1-NEXT: .LBB34_4:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa %xmm2, %xmm0
-; AVX1-NEXT: .LBB34_6:
-; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovd %r12d, %xmm1
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vandps %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: callq __truncsfbf2@PLT
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: shll $16, %r15d
+; AVX1-NEXT: vmovd %r15d, %xmm0
; AVX1-NEXT: shll $16, %r14d
-; AVX1-NEXT: vmovd %r14d, %xmm0
-; AVX1-NEXT: shll $16, %ebx
-; AVX1-NEXT: vmovd %ebx, %xmm2
-; AVX1-NEXT: js .LBB34_7
-; AVX1-NEXT: # %bb.8:
-; AVX1-NEXT: vmovdqa %xmm2, %xmm1
-; AVX1-NEXT: jmp .LBB34_9
-; AVX1-NEXT: .LBB34_7:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa %xmm2, %xmm0
-; AVX1-NEXT: .LBB34_9:
-; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovd %r14d, %xmm1
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vandps %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: callq __truncsfbf2@PLT
; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT: shll $16, %r15d
-; AVX1-NEXT: vmovd %r15d, %xmm0
; AVX1-NEXT: shll $16, %ebp
-; AVX1-NEXT: vmovd %ebp, %xmm2
-; AVX1-NEXT: js .LBB34_10
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: vmovdqa %xmm2, %xmm1
-; AVX1-NEXT: jmp .LBB34_12
-; AVX1-NEXT: .LBB34_10:
-; AVX1-NEXT: vmovdqa %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa %xmm2, %xmm0
-; AVX1-NEXT: .LBB34_12:
-; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovd %ebp, %xmm0
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovd %ebx, %xmm1
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vandps %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: callq __truncsfbf2@PLT
; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2286,59 +1880,51 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
; AVX512-NEXT: shrq $48, %r12
; AVX512-NEXT: movl %ebp, %eax
; AVX512-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
-; AVX512-NEXT: sets %cl
-; AVX512-NEXT: kmovw %ecx, %k1
-; AVX512-NEXT: movl %r13d, %ecx
-; AVX512-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000
-; AVX512-NEXT: vmovd %ecx, %xmm1
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: movl %r13d, %eax
+; AVX512-NEXT: andl $-65536, %eax # imm = 0xFFFF0000
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm3
+; AVX512-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: callq __truncsfbf2@PLT
; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: shll $16, %ebp
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovd %ebp, %xmm1
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: shll $16, %r13d
-; AVX512-NEXT: vmovd %r13d, %xmm1
-; AVX512-NEXT: vmovd %ebp, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovd %r13d, %xmm2
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm3
+; AVX512-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: callq __truncsfbf2@PLT
; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp)
; AVX512-NEXT: shll $16, %r12d
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovd %r12d, %xmm1
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: shll $16, %r15d
-; AVX512-NEXT: vmovd %r15d, %xmm1
-; AVX512-NEXT: vmovd %r12d, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovd %r15d, %xmm2
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm3
+; AVX512-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: callq __truncsfbf2@PLT
; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: shll $16, %r14d
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovd %r14d, %xmm1
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: shll $16, %ebx
-; AVX512-NEXT: vmovd %ebx, %xmm1
-; AVX512-NEXT: vmovd %r14d, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovd %ebx, %xmm2
+; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm3
+; AVX512-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: callq __truncsfbf2@PLT
; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
@@ -2365,90 +1951,67 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
; X86-NEXT: pushl %esi
; X86-NEXT: subl $68, %esp
; X86-NEXT: vpsrlq $48, %xmm0, %xmm2
-; X86-NEXT: vpsrlq $48, %xmm1, %xmm3
-; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; X86-NEXT: vpextrw $0, %xmm4, %esi
-; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; X86-NEXT: vpextrw $0, %xmm4, %ebx
+; X86-NEXT: vpextrw $0, %xmm2, %esi
+; X86-NEXT: vpsrlq $48, %xmm1, %xmm2
+; X86-NEXT: vpextrw $0, %xmm2, %edi
+; X86-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X86-NEXT: vpextrw $0, %xmm2, %ebx
+; X86-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X86-NEXT: vpextrw $0, %xmm2, %ebp
; X86-NEXT: vpextrw $0, %xmm0, %eax
-; X86-NEXT: vpextrw $0, %xmm1, %ecx
; X86-NEXT: vpsrld $16, %xmm0, %xmm0
-; X86-NEXT: vpextrw $0, %xmm0, %edx
-; X86-NEXT: vpsrld $16, %xmm1, %xmm0
-; X86-NEXT: vpextrw $0, %xmm0, %edi
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm0
-; X86-NEXT: shll $16, %edx
-; X86-NEXT: vmovd %edx, %xmm4
-; X86-NEXT: js .LBB34_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: vmovdqa %xmm4, %xmm1
-; X86-NEXT: jmp .LBB34_3
-; X86-NEXT: .LBB34_1:
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: vmovdqa %xmm4, %xmm0
-; X86-NEXT: .LBB34_3:
-; X86-NEXT: vpextrw $0, %xmm2, %edi
-; X86-NEXT: vpextrw $0, %xmm3, %ebp
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; X86-NEXT: vpsrld $16, %xmm1, %xmm2
+; X86-NEXT: vpextrw $0, %xmm2, %ecx
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: vmovd %ecx, %xmm2
+; X86-NEXT: vpextrw $0, %xmm0, %ecx
+; X86-NEXT: shll $16, %ecx
+; X86-NEXT: vmovd %ecx, %xmm0
+; X86-NEXT: vpextrw $0, %xmm1, %ecx
+; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; X86-NEXT: vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN]
+; X86-NEXT: vorps %xmm4, %xmm0, %xmm3
+; X86-NEXT: vandps %xmm1, %xmm3, %xmm1
+; X86-NEXT: vcmpunordss %xmm2, %xmm2, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: shll $16, %ecx
; X86-NEXT: vmovd %ecx, %xmm0
; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm2
-; X86-NEXT: js .LBB34_4
-; X86-NEXT: # %bb.5:
-; X86-NEXT: vmovdqa %xmm2, %xmm1
-; X86-NEXT: jmp .LBB34_6
-; X86-NEXT: .LBB34_4:
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: vmovdqa %xmm2, %xmm0
-; X86-NEXT: .LBB34_6:
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovd %eax, %xmm1
+; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; X86-NEXT: vorps %xmm4, %xmm1, %xmm3
+; X86-NEXT: vandps %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __truncsfbf2
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovss %xmm0, (%esp)
+; X86-NEXT: shll $16, %ebp
+; X86-NEXT: vmovd %ebp, %xmm0
; X86-NEXT: shll $16, %ebx
-; X86-NEXT: vmovd %ebx, %xmm0
-; X86-NEXT: shll $16, %esi
-; X86-NEXT: vmovd %esi, %xmm2
-; X86-NEXT: js .LBB34_7
-; X86-NEXT: # %bb.8:
-; X86-NEXT: vmovdqa %xmm2, %xmm1
-; X86-NEXT: jmp .LBB34_9
-; X86-NEXT: .LBB34_7:
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: vmovdqa %xmm2, %xmm0
-; X86-NEXT: .LBB34_9:
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovd %ebx, %xmm1
+; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm3
+; X86-NEXT: vandps %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __truncsfbf2
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: shll $16, %ebp
-; X86-NEXT: vmovd %ebp, %xmm0
; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm2
-; X86-NEXT: js .LBB34_10
-; X86-NEXT: # %bb.11:
-; X86-NEXT: vmovdqa %xmm2, %xmm1
-; X86-NEXT: jmp .LBB34_12
-; X86-NEXT: .LBB34_10:
-; X86-NEXT: vmovdqa %xmm0, %xmm1
-; X86-NEXT: vmovdqa %xmm2, %xmm0
-; X86-NEXT: .LBB34_12:
-; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
-; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; X86-NEXT: vmovd %edi, %xmm0
+; X86-NEXT: shll $16, %esi
+; X86-NEXT: vmovd %esi, %xmm1
+; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2
+; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm3
+; X86-NEXT: vandps %xmm2, %xmm3, %xmm2
+; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __truncsfbf2
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
index 008e3e4c217cb..fcec1a3ce5616 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
@@ -21,82 +21,57 @@ define float @test_v1f32(<1 x float> %a0) {
define float @test_v2f32(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: cmpunordss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm2
+; SSE2-NEXT: andps %xmm0, %xmm2
; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: js .LBB1_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movaps %xmm2, %xmm3
-; SSE2-NEXT: .LBB1_2:
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: cmpunordss %xmm3, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB1_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: .LBB1_4:
-; SSE2-NEXT: maxss %xmm2, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm1
-; SSE2-NEXT: orps %xmm4, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: maxss %xmm3, %xmm4
+; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: andps %xmm4, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm1
+; SSE2-NEXT: orps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: js .LBB1_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: jmp .LBB1_3
-; SSE41-NEXT: .LBB1_1:
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: .LBB1_3:
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: maxss %xmm3, %xmm1
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: cmpunordss %xmm2, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: maxss %xmm0, %xmm3
+; SSE41-NEXT: movaps {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; SSE41-NEXT: orps %xmm1, %xmm2
+; SSE41-NEXT: andps %xmm3, %xmm2
; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: cmpunordss %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v2f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB1_1
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: vmovaps %xmm0, %xmm2
-; AVX-NEXT: jmp .LBB1_3
-; AVX-NEXT: .LBB1_1:
-; AVX-NEXT: vmovaps %xmm1, %xmm2
-; AVX-NEXT: vmovaps %xmm0, %xmm1
-; AVX-NEXT: .LBB1_3:
-; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX512BW-LABEL: test_v2f32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
+; AVX512BW-NEXT: vorps %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vmaxss %xmm2, %xmm0, %xmm2
+; AVX512BW-NEXT: vandps %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmovaps %xmm1, %xmm0
+; AVX512BW-NEXT: retq
;
-; AVX512-LABEL: test_v2f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT: vmovaps %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: retq
+; AVX512VL-LABEL: test_v2f32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512VL-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm1 = xmm1 & (xmm0 | m32bcst)
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmovaps %xmm1, %xmm0
+; AVX512VL-NEXT: retq
%1 = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a0)
ret float %1
}
@@ -104,205 +79,145 @@ define float @test_v2f32(<2 x float> %a0) {
define float @test_v4f32(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: js .LBB2_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movaps %xmm3, %xmm4
-; SSE2-NEXT: .LBB2_2:
-; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm0, %xmm5
+; SSE2-NEXT: movaps %xmm0, %xmm6
+; SSE2-NEXT: cmpunordss %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm6, %xmm7
+; SSE2-NEXT: andps %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3]
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: movaps %xmm4, %xmm2
-; SSE2-NEXT: cmpunordss %xmm4, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm5
-; SSE2-NEXT: andps %xmm4, %xmm5
-; SSE2-NEXT: js .LBB2_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: .LBB2_4:
-; SSE2-NEXT: maxss %xmm3, %xmm4
-; SSE2-NEXT: andnps %xmm4, %xmm2
-; SSE2-NEXT: orps %xmm5, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movaps %xmm2, %xmm4
-; SSE2-NEXT: js .LBB2_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: .LBB2_6:
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE2-NEXT: movaps %xmm4, %xmm3
-; SSE2-NEXT: cmpunordss %xmm4, %xmm3
-; SSE2-NEXT: movaps %xmm3, %xmm5
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
+; SSE2-NEXT: maxss %xmm3, %xmm5
+; SSE2-NEXT: movaps {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
+; SSE2-NEXT: orps %xmm3, %xmm4
; SSE2-NEXT: andps %xmm4, %xmm5
-; SSE2-NEXT: js .LBB2_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: movaps %xmm2, %xmm1
-; SSE2-NEXT: .LBB2_8:
-; SSE2-NEXT: maxss %xmm1, %xmm4
-; SSE2-NEXT: andnps %xmm4, %xmm3
-; SSE2-NEXT: orps %xmm5, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movaps %xmm3, %xmm2
-; SSE2-NEXT: js .LBB2_10
-; SSE2-NEXT: # %bb.9:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: .LBB2_10:
-; SSE2-NEXT: movaps %xmm2, %xmm1
-; SSE2-NEXT: cmpunordss %xmm2, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: andps %xmm2, %xmm4
-; SSE2-NEXT: js .LBB2_12
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: movaps %xmm3, %xmm0
-; SSE2-NEXT: .LBB2_12:
-; SSE2-NEXT: maxss %xmm0, %xmm2
-; SSE2-NEXT: andnps %xmm2, %xmm1
-; SSE2-NEXT: orps %xmm4, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: andnps %xmm5, %xmm7
+; SSE2-NEXT: orps %xmm7, %xmm6
+; SSE2-NEXT: movaps %xmm6, %xmm4
+; SSE2-NEXT: cmpunordss %xmm6, %xmm4
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: andps %xmm6, %xmm0
+; SSE2-NEXT: movaps %xmm6, %xmm5
+; SSE2-NEXT: orps %xmm3, %xmm5
+; SSE2-NEXT: maxss %xmm1, %xmm6
+; SSE2-NEXT: andps %xmm5, %xmm6
+; SSE2-NEXT: andnps %xmm6, %xmm4
+; SSE2-NEXT: orps %xmm0, %xmm4
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: cmpunordss %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: andps %xmm4, %xmm1
+; SSE2-NEXT: orps %xmm4, %xmm3
+; SSE2-NEXT: maxss %xmm2, %xmm4
+; SSE2-NEXT: andps %xmm3, %xmm4
+; SSE2-NEXT: andnps %xmm4, %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: js .LBB2_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: jmp .LBB2_3
-; SSE41-NEXT: .LBB2_1:
-; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: .LBB2_3:
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE41-NEXT: movaps %xmm3, %xmm4
-; SSE41-NEXT: maxss %xmm0, %xmm4
-; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: cmpunordss %xmm3, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4
-; SSE41-NEXT: movd %xmm4, %eax
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: js .LBB2_4
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: movaps %xmm4, %xmm0
-; SSE41-NEXT: jmp .LBB2_6
-; SSE41-NEXT: .LBB2_4:
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: movaps %xmm4, %xmm2
-; SSE41-NEXT: .LBB2_6:
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm3
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: cmpunordss %xmm2, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: movd %xmm3, %eax
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: js .LBB2_7
-; SSE41-NEXT: # %bb.8:
-; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: jmp .LBB2_9
-; SSE41-NEXT: .LBB2_7:
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm1
-; SSE41-NEXT: .LBB2_9:
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: maxss %xmm0, %xmm2
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: movaps {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; SSE41-NEXT: movaps %xmm1, %xmm4
+; SSE41-NEXT: orps %xmm2, %xmm4
+; SSE41-NEXT: movaps %xmm1, %xmm5
+; SSE41-NEXT: maxss %xmm0, %xmm5
+; SSE41-NEXT: andps %xmm4, %xmm5
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: cmpunordss %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm5
+; SSE41-NEXT: movaps %xmm1, %xmm4
+; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm1[3,3]
+; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
+; SSE41-NEXT: movaps %xmm5, %xmm0
+; SSE41-NEXT: orps %xmm2, %xmm0
+; SSE41-NEXT: movaps %xmm5, %xmm6
+; SSE41-NEXT: maxss %xmm3, %xmm6
+; SSE41-NEXT: andps %xmm0, %xmm6
+; SSE41-NEXT: movaps %xmm5, %xmm0
+; SSE41-NEXT: cmpunordss %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm6
+; SSE41-NEXT: orps %xmm6, %xmm2
+; SSE41-NEXT: movaps %xmm6, %xmm1
+; SSE41-NEXT: maxss %xmm4, %xmm1
+; SSE41-NEXT: andps %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm6, %xmm0
+; SSE41-NEXT: cmpunordss %xmm6, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB2_1
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: vmovaps %xmm0, %xmm3
-; AVX-NEXT: jmp .LBB2_3
-; AVX-NEXT: .LBB2_1:
-; AVX-NEXT: vmovaps %xmm2, %xmm3
-; AVX-NEXT: vmovaps %xmm0, %xmm2
-; AVX-NEXT: .LBB2_3:
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vmaxss %xmm3, %xmm2, %xmm3
-; AVX-NEXT: vcmpunordss %xmm2, %xmm2, %xmm4
-; AVX-NEXT: vblendvps %xmm4, %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vmovd %xmm3, %eax
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB2_4
-; AVX-NEXT: # %bb.5:
-; AVX-NEXT: vmovaps %xmm3, %xmm2
-; AVX-NEXT: jmp .LBB2_6
-; AVX-NEXT: .LBB2_4:
-; AVX-NEXT: vmovapd %xmm1, %xmm2
-; AVX-NEXT: vmovaps %xmm3, %xmm1
-; AVX-NEXT: .LBB2_6:
-; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
-; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmovd %xmm1, %eax
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB2_7
-; AVX-NEXT: # %bb.8:
-; AVX-NEXT: vmovaps %xmm1, %xmm2
-; AVX-NEXT: jmp .LBB2_9
-; AVX-NEXT: .LBB2_7:
-; AVX-NEXT: vmovaps %xmm0, %xmm2
-; AVX-NEXT: vmovaps %xmm1, %xmm0
-; AVX-NEXT: .LBB2_9:
-; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN]
+; AVX-NEXT: vorps %xmm4, %xmm0, %xmm5
+; AVX-NEXT: vmaxss %xmm3, %xmm0, %xmm3
+; AVX-NEXT: vandps %xmm3, %xmm5, %xmm3
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm5
+; AVX-NEXT: vblendvps %xmm5, %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vorps %xmm4, %xmm0, %xmm3
+; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vandps %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vorps %xmm4, %xmm0, %xmm2
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vandps %xmm1, %xmm2, %xmm1
; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512-LABEL: test_v4f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vmovaps %xmm0, %xmm4
-; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1}
-; AVX512-NEXT: vmaxss %xmm4, %xmm3, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm3, %xmm3, %k1
-; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vmovaps %xmm0, %xmm3
-; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmaxss %xmm3, %xmm2, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm2, %xmm2, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: testl %eax, %eax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vmovaps %xmm0, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: test_v4f32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN]
+; AVX512BW-NEXT: vorps %xmm4, %xmm0, %xmm5
+; AVX512BW-NEXT: vmaxss %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vandps %xmm3, %xmm5, %xmm3
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1}
+; AVX512BW-NEXT: vorps %xmm4, %xmm3, %xmm0
+; AVX512BW-NEXT: vmaxss %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vandps %xmm2, %xmm0, %xmm2
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vorps %xmm4, %xmm2, %xmm0
+; AVX512BW-NEXT: vmaxss %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1
+; AVX512BW-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v4f32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX512VL-NEXT: vmaxss %xmm3, %xmm0, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN]
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm3 = xmm3 & (xmm0 | xmm4)
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm2 = xmm2 & (xmm3 | xmm4)
+; AVX512VL-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512VL-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm2 | xmm4)
+; AVX512VL-NEXT: vcmpunordss %xmm2, %xmm2, %k1
+; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: retq
%1 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a0)
ret float %1
}
@@ -310,284 +225,175 @@ define float @test_v4f32(<4 x float> %a0) {
define float @test_v8f32(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: maxps %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: cmpunordps %xmm2, %xmm0
-; SSE2-NEXT: andps %xmm0, %xmm2
-; SSE2-NEXT: andnps %xmm3, %xmm0
-; SSE2-NEXT: orps %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: js .LBB3_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: .LBB3_2:
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: cmpunordss %xmm3, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB3_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: .LBB3_4:
-; SSE2-NEXT: maxss %xmm2, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm1
-; SSE2-NEXT: orps %xmm4, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: js .LBB3_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: .LBB3_6:
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: cmpunordss %xmm4, %xmm2
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: maxps %xmm1, %xmm2
+; SSE2-NEXT: movaps {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: orps %xmm1, %xmm3
+; SSE2-NEXT: andps %xmm2, %xmm3
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordps %xmm0, %xmm2
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: andnps %xmm3, %xmm2
+; SSE2-NEXT: orps %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: cmpunordss %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: andps %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1]
; SSE2-NEXT: movaps %xmm2, %xmm5
-; SSE2-NEXT: andps %xmm4, %xmm5
-; SSE2-NEXT: js .LBB3_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: .LBB3_8:
-; SSE2-NEXT: maxss %xmm3, %xmm4
-; SSE2-NEXT: andnps %xmm4, %xmm2
-; SSE2-NEXT: orps %xmm5, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: js .LBB3_10
-; SSE2-NEXT: # %bb.9:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: .LBB3_10:
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: cmpunordss %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[3,3,3,3]
+; SSE2-NEXT: maxss %xmm4, %xmm2
+; SSE2-NEXT: orps %xmm1, %xmm5
+; SSE2-NEXT: andps %xmm2, %xmm5
+; SSE2-NEXT: andnps %xmm5, %xmm0
+; SSE2-NEXT: orps %xmm3, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm3
; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB3_12
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: .LBB3_12:
-; SSE2-NEXT: maxss %xmm1, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm0
-; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm4
+; SSE2-NEXT: maxss %xmm6, %xmm0
+; SSE2-NEXT: andps %xmm4, %xmm0
+; SSE2-NEXT: andnps %xmm0, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: cmpunordss %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm3
+; SSE2-NEXT: andps %xmm2, %xmm3
+; SSE2-NEXT: orps %xmm2, %xmm1
+; SSE2-NEXT: maxss %xmm7, %xmm2
+; SSE2-NEXT: andps %xmm1, %xmm2
+; SSE2-NEXT: andnps %xmm2, %xmm0
+; SSE2-NEXT: orps %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: blendvps %xmm0, %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: maxps %xmm3, %xmm2
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: cmpunordps %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE41-NEXT: movd %xmm2, %eax
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: js .LBB3_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: jmp .LBB3_3
-; SSE41-NEXT: .LBB3_1:
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: .LBB3_3:
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm3
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: cmpunordss %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movd %xmm3, %eax
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: js .LBB3_4
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: jmp .LBB3_6
-; SSE41-NEXT: .LBB3_4:
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm1
-; SSE41-NEXT: .LBB3_6:
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm3
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: cmpunordss %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movd %xmm3, %eax
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: js .LBB3_7
-; SSE41-NEXT: # %bb.8:
-; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: jmp .LBB3_9
-; SSE41-NEXT: .LBB3_7:
+; SSE41-NEXT: movaps %xmm0, %xmm2
+; SSE41-NEXT: maxps %xmm1, %xmm0
+; SSE41-NEXT: movaps {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
+; SSE41-NEXT: movaps %xmm2, %xmm4
+; SSE41-NEXT: orps %xmm3, %xmm4
+; SSE41-NEXT: andps %xmm0, %xmm4
; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm2
-; SSE41-NEXT: .LBB3_9:
-; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: cmpunordps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
+; SSE41-NEXT: movaps %xmm4, %xmm1
; SSE41-NEXT: maxss %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm4, %xmm2
+; SSE41-NEXT: orps %xmm3, %xmm2
+; SSE41-NEXT: andps %xmm1, %xmm2
+; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: cmpunordss %xmm4, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: orps %xmm3, %xmm0
+; SSE41-NEXT: movaps %xmm4, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
+; SSE41-NEXT: movaps %xmm2, %xmm5
+; SSE41-NEXT: maxss %xmm1, %xmm5
+; SSE41-NEXT: andps %xmm0, %xmm5
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: cmpunordss %xmm2, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm5
+; SSE41-NEXT: orps %xmm5, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
+; SSE41-NEXT: movaps %xmm5, %xmm1
+; SSE41-NEXT: maxss %xmm4, %xmm1
+; SSE41-NEXT: andps %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm5, %xmm0
+; SSE41-NEXT: cmpunordss %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmaxps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; AVX-NEXT: vorps %xmm2, %xmm0, %xmm3
+; AVX-NEXT: vandps %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB3_1
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: vmovaps %xmm0, %xmm2
-; AVX-NEXT: jmp .LBB3_3
-; AVX-NEXT: .LBB3_1:
-; AVX-NEXT: vmovaps %xmm1, %xmm2
-; AVX-NEXT: vmovaps %xmm0, %xmm1
-; AVX-NEXT: .LBB3_3:
-; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
-; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vmovd %xmm2, %eax
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB3_4
-; AVX-NEXT: # %bb.5:
-; AVX-NEXT: vmovaps %xmm2, %xmm3
-; AVX-NEXT: jmp .LBB3_6
-; AVX-NEXT: .LBB3_4:
-; AVX-NEXT: vmovapd %xmm1, %xmm3
-; AVX-NEXT: vmovaps %xmm2, %xmm1
-; AVX-NEXT: .LBB3_6:
-; AVX-NEXT: vmaxss %xmm3, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
-; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vorps %xmm2, %xmm0, %xmm3
+; AVX-NEXT: vandps %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vorps %xmm2, %xmm1, %xmm3
+; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX-NEXT: vmaxss %xmm4, %xmm1, %xmm4
+; AVX-NEXT: vandps %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm4
+; AVX-NEXT: vblendvps %xmm4, %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vorps %xmm2, %xmm1, %xmm2
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB3_7
-; AVX-NEXT: # %bb.8:
-; AVX-NEXT: vmovaps %xmm1, %xmm2
-; AVX-NEXT: jmp .LBB3_9
-; AVX-NEXT: .LBB3_7:
-; AVX-NEXT: vmovaps %xmm0, %xmm2
-; AVX-NEXT: vmovaps %xmm1, %xmm0
-; AVX-NEXT: .LBB3_9:
-; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandps %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512BW-LABEL: test_v8f32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vmaxps %xmm2, %xmm0, %xmm1
-; AVX512BW-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
-; AVX512BW-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmaxps %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; AVX512BW-NEXT: vorps %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vandps %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vcmpunordps %xmm0, %xmm0, %xmm3
+; AVX512BW-NEXT: vblendvps %xmm3, %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: testl %eax, %eax
-; AVX512BW-NEXT: sets %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovaps %xmm0, %xmm2
-; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vorps %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vandps %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT: vorps %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX512BW-NEXT: vmaxss %xmm4, %xmm1, %xmm4
+; AVX512BW-NEXT: vandps %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: testl %eax, %eax
-; AVX512BW-NEXT: sets %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512BW-NEXT: vmovaps %xmm2, %xmm3
; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
-; AVX512BW-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm2
-; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: testl %eax, %eax
-; AVX512BW-NEXT: sets %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX512BW-NEXT: vmovaps %xmm2, %xmm0
-; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512BW-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vmaxss %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vorps %xmm2, %xmm3, %xmm1
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vmaxss %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT: vandps %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_v8f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vblendmps %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmaxps %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm1 = xmm1 & (xmm0 | xmm2)
+; AVX512VL-NEXT: vcmpunordps %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmovaps %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxps %xmm2, %xmm1, %xmm0
-; AVX512VL-NEXT: vcmpunordps %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: testl %eax, %eax
-; AVX512VL-NEXT: sets %al
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovaps %xmm0, %xmm2
-; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxss %xmm2, %xmm1, %xmm2
-; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: testl %eax, %eax
-; AVX512VL-NEXT: sets %al
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vmovaps %xmm2, %xmm3
-; AVX512VL-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
-; AVX512VL-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxss %xmm3, %xmm1, %xmm2
-; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: testl %eax, %eax
-; AVX512VL-NEXT: sets %al
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX512VL-NEXT: vmovaps %xmm2, %xmm0
-; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 | xmm2)
; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX512VL-NEXT: vmaxss %xmm3, %xmm0, %xmm3
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm3 = xmm3 & (xmm0 | xmm2)
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1}
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm3 | xmm2)
+; AVX512VL-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %a0)
@@ -597,357 +403,228 @@ define float @test_v8f32(<8 x float> %a0) {
define float @test_v16f32(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm0, %xmm6
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: pandn %xmm2, %xmm7
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: por %xmm7, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: maxps %xmm2, %xmm6
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: cmpunordps %xmm5, %xmm0
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: maxps %xmm2, %xmm4
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
+; SSE2-NEXT: movaps %xmm0, %xmm5
+; SSE2-NEXT: orps %xmm2, %xmm5
+; SSE2-NEXT: andps %xmm4, %xmm5
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: cmpunordps %xmm0, %xmm4
+; SSE2-NEXT: andps %xmm4, %xmm0
+; SSE2-NEXT: andnps %xmm5, %xmm4
+; SSE2-NEXT: orps %xmm0, %xmm4
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: cmpunordps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm5
; SSE2-NEXT: andps %xmm0, %xmm5
-; SSE2-NEXT: andnps %xmm6, %xmm0
+; SSE2-NEXT: movaps %xmm1, %xmm6
+; SSE2-NEXT: maxps %xmm3, %xmm6
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: orps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm6, %xmm3
+; SSE2-NEXT: movaps %xmm1, %xmm6
+; SSE2-NEXT: cmpunordps %xmm1, %xmm6
+; SSE2-NEXT: andps %xmm6, %xmm1
+; SSE2-NEXT: andnps %xmm3, %xmm6
+; SSE2-NEXT: orps %xmm1, %xmm6
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: maxps %xmm6, %xmm1
+; SSE2-NEXT: orps %xmm2, %xmm4
+; SSE2-NEXT: andps %xmm1, %xmm4
+; SSE2-NEXT: andnps %xmm4, %xmm0
; SSE2-NEXT: orps %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: pandn %xmm3, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: maxps %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: cmpunordps %xmm4, %xmm3
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: andnps %xmm1, %xmm3
-; SSE2-NEXT: orps %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: maxps %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: cmpunordps %xmm2, %xmm0
-; SSE2-NEXT: andps %xmm0, %xmm2
-; SSE2-NEXT: andnps %xmm1, %xmm0
-; SSE2-NEXT: orps %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: js .LBB4_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: .LBB4_2:
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: cmpunordss %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: cmpunordss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: andps %xmm0, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; SSE2-NEXT: movaps %xmm0, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
+; SSE2-NEXT: maxss %xmm4, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm5
+; SSE2-NEXT: andps %xmm0, %xmm5
+; SSE2-NEXT: andnps %xmm5, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: cmpunordss %xmm1, %xmm3
+; SSE2-NEXT: movaps %xmm3, %xmm0
+; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB4_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: .LBB4_4:
-; SSE2-NEXT: maxss %xmm2, %xmm3
-; SSE2-NEXT: andnps %xmm3, %xmm1
-; SSE2-NEXT: orps %xmm4, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: js .LBB4_6
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: .LBB4_6:
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: cmpunordss %xmm4, %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm5
-; SSE2-NEXT: andps %xmm4, %xmm5
-; SSE2-NEXT: js .LBB4_8
-; SSE2-NEXT: # %bb.7:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: .LBB4_8:
-; SSE2-NEXT: maxss %xmm3, %xmm4
-; SSE2-NEXT: andnps %xmm4, %xmm2
-; SSE2-NEXT: orps %xmm5, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; SSE2-NEXT: testl %eax, %eax
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: js .LBB4_10
-; SSE2-NEXT: # %bb.9:
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: .LBB4_10:
-; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: orps %xmm2, %xmm4
+; SSE2-NEXT: maxss %xmm6, %xmm1
+; SSE2-NEXT: andps %xmm4, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm3
+; SSE2-NEXT: orps %xmm0, %xmm3
+; SSE2-NEXT: movaps %xmm3, %xmm0
; SSE2-NEXT: cmpunordss %xmm3, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: andps %xmm3, %xmm4
-; SSE2-NEXT: js .LBB4_12
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: .LBB4_12:
-; SSE2-NEXT: maxss %xmm1, %xmm3
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: andps %xmm3, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: maxss %xmm7, %xmm3
+; SSE2-NEXT: andps %xmm2, %xmm3
; SSE2-NEXT: andnps %xmm3, %xmm0
-; SSE2-NEXT: orps %xmm4, %xmm0
+; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: movaps %xmm1, %xmm6
; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm6
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movaps %xmm3, %xmm5
-; SSE41-NEXT: maxps %xmm6, %xmm5
-; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: cmpunordps %xmm3, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm5
-; SSE41-NEXT: movaps %xmm4, %xmm3
-; SSE41-NEXT: movaps %xmm4, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: maxps %xmm3, %xmm1
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: cmpunordps %xmm2, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: maxps %xmm3, %xmm0
+; SSE41-NEXT: movaps {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
+; SSE41-NEXT: movaps %xmm1, %xmm5
+; SSE41-NEXT: orps %xmm3, %xmm5
+; SSE41-NEXT: andps %xmm0, %xmm5
; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3
+; SSE41-NEXT: cmpunordps %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movaps %xmm5, %xmm2
-; SSE41-NEXT: maxps %xmm3, %xmm2
-; SSE41-NEXT: movaps %xmm5, %xmm0
-; SSE41-NEXT: cmpunordps %xmm5, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE41-NEXT: movd %xmm2, %eax
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: js .LBB4_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: jmp .LBB4_3
-; SSE41-NEXT: .LBB4_1:
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: .LBB4_3:
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm3
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: cmpunordss %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movd %xmm3, %eax
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: js .LBB4_4
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: jmp .LBB4_6
-; SSE41-NEXT: .LBB4_4:
+; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: maxps %xmm2, %xmm0
+; SSE41-NEXT: movaps %xmm4, %xmm1
+; SSE41-NEXT: orps %xmm3, %xmm1
+; SSE41-NEXT: andps %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: cmpunordps %xmm4, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm1
-; SSE41-NEXT: .LBB4_6:
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm3
+; SSE41-NEXT: maxps %xmm5, %xmm0
+; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: orps %xmm3, %xmm2
+; SSE41-NEXT: andps %xmm0, %xmm2
; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: cmpunordss %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movd %xmm3, %eax
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: js .LBB4_7
-; SSE41-NEXT: # %bb.8:
-; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: jmp .LBB4_9
-; SSE41-NEXT: .LBB4_7:
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm2
-; SSE41-NEXT: .LBB4_9:
+; SSE41-NEXT: cmpunordps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: maxss %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm4
+; SSE41-NEXT: orps %xmm3, %xmm4
+; SSE41-NEXT: andps %xmm1, %xmm4
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: cmpunordss %xmm2, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm4
+; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: orps %xmm3, %xmm0
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: movaps %xmm4, %xmm5
+; SSE41-NEXT: maxss %xmm1, %xmm5
+; SSE41-NEXT: andps %xmm0, %xmm5
+; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: cmpunordss %xmm4, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm5
+; SSE41-NEXT: orps %xmm5, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSE41-NEXT: movaps %xmm5, %xmm1
+; SSE41-NEXT: maxss %xmm2, %xmm1
+; SSE41-NEXT: andps %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm5, %xmm0
+; SSE41-NEXT: cmpunordss %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
; AVX: # %bb.0:
-; AVX-NEXT: vblendvps %ymm0, %ymm1, %ymm0, %ymm2
-; AVX-NEXT: vblendvps %ymm0, %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vmaxps %ymm2, %ymm0, %ymm1
-; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm2
-; AVX-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; AVX-NEXT: vorps %ymm2, %ymm0, %ymm3
+; AVX-NEXT: vandps %ymm1, %ymm3, %ymm1
+; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm3
+; AVX-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmaxps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vorps %xmm2, %xmm0, %xmm3
+; AVX-NEXT: vandps %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB4_1
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: vmovaps %xmm0, %xmm2
-; AVX-NEXT: jmp .LBB4_3
-; AVX-NEXT: .LBB4_1:
-; AVX-NEXT: vmovaps %xmm1, %xmm2
-; AVX-NEXT: vmovaps %xmm0, %xmm1
-; AVX-NEXT: .LBB4_3:
-; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
-; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vmovd %xmm2, %eax
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB4_4
-; AVX-NEXT: # %bb.5:
-; AVX-NEXT: vmovaps %xmm2, %xmm3
-; AVX-NEXT: jmp .LBB4_6
-; AVX-NEXT: .LBB4_4:
-; AVX-NEXT: vmovapd %xmm1, %xmm3
-; AVX-NEXT: vmovaps %xmm2, %xmm1
-; AVX-NEXT: .LBB4_6:
-; AVX-NEXT: vmaxss %xmm3, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
-; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vorps %xmm2, %xmm0, %xmm3
+; AVX-NEXT: vandps %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvps %xmm3, %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vorps %xmm2, %xmm1, %xmm3
+; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX-NEXT: vmaxss %xmm4, %xmm1, %xmm4
+; AVX-NEXT: vandps %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm4
+; AVX-NEXT: vblendvps %xmm4, %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vorps %xmm2, %xmm1, %xmm2
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB4_7
-; AVX-NEXT: # %bb.8:
-; AVX-NEXT: vmovaps %xmm1, %xmm2
-; AVX-NEXT: jmp .LBB4_9
-; AVX-NEXT: .LBB4_7:
-; AVX-NEXT: vmovaps %xmm0, %xmm2
-; AVX-NEXT: vmovaps %xmm1, %xmm0
-; AVX-NEXT: .LBB4_9:
-; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandps %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512BW-LABEL: test_v16f32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vblendvps %ymm0, %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT: vblendvps %ymm0, %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vmaxps %ymm2, %ymm0, %ymm1
-; AVX512BW-NEXT: vcmpunordps %ymm0, %ymm0, %ymm2
-; AVX512BW-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vmaxps %ymm1, %ymm0, %ymm1
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; AVX512BW-NEXT: vorps %ymm2, %ymm0, %ymm3
+; AVX512BW-NEXT: vandps %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vcmpunordps %ymm0, %ymm0, %ymm3
+; AVX512BW-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vmaxps %xmm2, %xmm0, %xmm1
-; AVX512BW-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
-; AVX512BW-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmaxps %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vorps %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vandps %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vcmpunordps %xmm0, %xmm0, %xmm3
+; AVX512BW-NEXT: vblendvps %xmm3, %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: testl %eax, %eax
-; AVX512BW-NEXT: sets %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovaps %xmm0, %xmm2
-; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512BW-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vorps %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vandps %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT: vorps %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX512BW-NEXT: vmaxss %xmm4, %xmm1, %xmm4
+; AVX512BW-NEXT: vandps %xmm4, %xmm3, %xmm3
; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: testl %eax, %eax
-; AVX512BW-NEXT: sets %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512BW-NEXT: vmovaps %xmm2, %xmm3
; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
-; AVX512BW-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm2
-; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: testl %eax, %eax
-; AVX512BW-NEXT: sets %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX512BW-NEXT: vmovaps %xmm2, %xmm0
-; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512BW-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vmaxss %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vorps %xmm2, %xmm3, %xmm1
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512BW-NEXT: vmaxss %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT: vandps %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512BW-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_v16f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vblendmps %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vmaxps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; AVX512VL-NEXT: vpternlogd {{.*#+}} ymm1 = ymm1 & (ymm0 | ymm2)
+; AVX512VL-NEXT: vcmpunordps %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vmovaps %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmaxps %ymm2, %ymm1, %ymm0
-; AVX512VL-NEXT: vcmpunordps %ymm1, %ymm1, %k1
-; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 {%k1}
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vblendmps %xmm1, %xmm0, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovaps %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxps %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX512VL-NEXT: vmaxps %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 | xmm2)
; AVX512VL-NEXT: vcmpunordps %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: testl %eax, %eax
-; AVX512VL-NEXT: sets %al
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovaps %xmm0, %xmm2
-; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
+; AVX512VL-NEXT: vmaxss %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm1 = xmm1 & (xmm0 | xmm2)
+; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxss %xmm2, %xmm1, %xmm2
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512VL-NEXT: vmaxss %xmm3, %xmm1, %xmm3
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm3 = xmm3 & (xmm1 | xmm2)
; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: testl %eax, %eax
-; AVX512VL-NEXT: sets %al
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vmovaps %xmm2, %xmm3
; AVX512VL-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1}
-; AVX512VL-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxss %xmm3, %xmm1, %xmm2
-; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: testl %eax, %eax
-; AVX512VL-NEXT: sets %al
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX512VL-NEXT: vmovaps %xmm2, %xmm0
-; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vcmpunordss %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm3 | xmm2)
+; AVX512VL-NEXT: vcmpunordss %xmm3, %xmm3, %k1
+; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> %a0)
@@ -961,83 +638,66 @@ define float @test_v16f32(<16 x float> %a0) {
define double @test_v2f64(<2 x double> %a0) {
; SSE2-LABEL: test_v2f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: js .LBB5_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: .LBB5_2:
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: cmpunordsd %xmm3, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm4
-; SSE2-NEXT: andpd %xmm3, %xmm4
-; SSE2-NEXT: js .LBB5_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: .LBB5_4:
-; SSE2-NEXT: maxsd %xmm2, %xmm3
-; SSE2-NEXT: andnpd %xmm3, %xmm1
-; SSE2-NEXT: orpd %xmm4, %xmm1
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm3
+; SSE2-NEXT: andpd %xmm0, %xmm3
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: maxsd %xmm2, %xmm4
+; SSE2-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: andpd %xmm4, %xmm0
+; SSE2-NEXT: andnpd %xmm0, %xmm1
+; SSE2-NEXT: orpd %xmm3, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB5_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: jmp .LBB5_3
-; SSE41-NEXT: .LBB5_1:
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: .LBB5_3:
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: maxsd %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm0, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: maxsd %xmm0, %xmm3
+; SSE41-NEXT: movapd {{.*#+}} xmm2 = [NaN,NaN]
+; SSE41-NEXT: orpd %xmm1, %xmm2
+; SSE41-NEXT: andpd %xmm3, %xmm2
; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: testq %rax, %rax
-; AVX-NEXT: js .LBB5_1
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: vmovapd %xmm0, %xmm2
-; AVX-NEXT: jmp .LBB5_3
-; AVX-NEXT: .LBB5_1:
-; AVX-NEXT: vmovapd %xmm1, %xmm2
-; AVX-NEXT: vmovapd %xmm0, %xmm1
-; AVX-NEXT: .LBB5_3:
-; AVX-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
-; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT: vandpd %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512-LABEL: test_v2f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: testq %rax, %rax
-; AVX512-NEXT: sets %al
-; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vmovapd %xmm0, %xmm2
-; AVX512-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
-; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
-; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: test_v2f64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512BW-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT: vandpd %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512BW-NEXT: vmovapd %xmm1, %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v2f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512VL-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 & (xmm0 | m64bcst)
+; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
+; AVX512VL-NEXT: vmovapd %xmm1, %xmm0
+; AVX512VL-NEXT: retq
%1 = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a0)
ret double %1
}
@@ -1045,96 +705,67 @@ define double @test_v2f64(<2 x double> %a0) {
define double @test_v4f64(<4 x double> %a0) {
; SSE2-LABEL: test_v4f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: maxpd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: cmpunordpd %xmm3, %xmm1
-; SSE2-NEXT: andpd %xmm1, %xmm3
-; SSE2-NEXT: andnpd %xmm0, %xmm1
-; SSE2-NEXT: orpd %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: js .LBB6_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: .LBB6_2:
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: cmpunordsd %xmm3, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm4
-; SSE2-NEXT: andpd %xmm3, %xmm4
-; SSE2-NEXT: js .LBB6_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: .LBB6_4:
-; SSE2-NEXT: maxsd %xmm2, %xmm3
-; SSE2-NEXT: andnpd %xmm3, %xmm0
-; SSE2-NEXT: orpd %xmm4, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: maxpd %xmm1, %xmm2
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [NaN,NaN]
+; SSE2-NEXT: movapd %xmm0, %xmm3
+; SSE2-NEXT: orpd %xmm1, %xmm3
+; SSE2-NEXT: andpd %xmm2, %xmm3
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm2
+; SSE2-NEXT: andpd %xmm2, %xmm0
+; SSE2-NEXT: andnpd %xmm3, %xmm2
+; SSE2-NEXT: orpd %xmm0, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: cmpunordsd %xmm2, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm3
+; SSE2-NEXT: andpd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
+; SSE2-NEXT: movapd %xmm2, %xmm5
+; SSE2-NEXT: maxsd %xmm4, %xmm5
+; SSE2-NEXT: orpd %xmm1, %xmm2
+; SSE2-NEXT: andpd %xmm5, %xmm2
+; SSE2-NEXT: andnpd %xmm2, %xmm0
+; SSE2-NEXT: orpd %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movapd %xmm0, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm0, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: maxpd %xmm2, %xmm3
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSE41-NEXT: movq %xmm3, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB6_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: jmp .LBB6_3
-; SSE41-NEXT: .LBB6_1:
+; SSE41-NEXT: maxpd %xmm1, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm1 = [NaN,NaN]
+; SSE41-NEXT: movapd %xmm2, %xmm3
+; SSE41-NEXT: orpd %xmm1, %xmm3
+; SSE41-NEXT: andpd %xmm0, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
; SSE41-NEXT: movapd %xmm3, %xmm2
-; SSE41-NEXT: .LBB6_3:
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: maxsd %xmm0, %xmm1
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: maxsd %xmm0, %xmm2
+; SSE41-NEXT: orpd %xmm3, %xmm1
+; SSE41-NEXT: andpd %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f64:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX-NEXT: vmovq %xmm1, %rax
-; AVX-NEXT: testq %rax, %rax
-; AVX-NEXT: js .LBB6_1
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: vmovapd %xmm1, %xmm2
-; AVX-NEXT: jmp .LBB6_3
-; AVX-NEXT: .LBB6_1:
-; AVX-NEXT: vmovapd %xmm0, %xmm2
-; AVX-NEXT: vmovapd %xmm1, %xmm0
-; AVX-NEXT: .LBB6_3:
-; AVX-NEXT: vmaxsd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [NaN,NaN]
+; AVX-NEXT: # xmm2 = mem[0,0]
+; AVX-NEXT: vorpd %xmm2, %xmm0, %xmm3
+; AVX-NEXT: vandpd %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vorpd %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vandpd %xmm1, %xmm2, %xmm1
; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
@@ -1143,20 +774,17 @@ define double @test_v4f64(<4 x double> %a0) {
; AVX512BW-LABEL: test_v4f64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
-; AVX512BW-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
-; AVX512BW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: testq %rax, %rax
-; AVX512BW-NEXT: sets %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovapd %xmm0, %xmm2
-; AVX512BW-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512BW-NEXT: vmaxpd %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vmovddup {{.*#+}} xmm2 = [NaN,NaN]
+; AVX512BW-NEXT: # xmm2 = mem[0,0]
+; AVX512BW-NEXT: vorpd %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vandpd %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm3
+; AVX512BW-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512BW-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vorpd %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT: vandpd %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vzeroupper
@@ -1164,23 +792,15 @@ define double @test_v4f64(<4 x double> %a0) {
;
; AVX512VL-LABEL: test_v4f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vblendmpd %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmaxpd %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [NaN,NaN]
+; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 & (xmm0 | xmm2)
+; AVX512VL-NEXT: vcmpunordpd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmovapd %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxpd %xmm2, %xmm1, %xmm0
-; AVX512VL-NEXT: vcmpunordpd %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: testq %rax, %rax
-; AVX512VL-NEXT: sets %al
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovapd %xmm0, %xmm2
-; AVX512VL-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512VL-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 & (xmm1 | xmm2)
; AVX512VL-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
@@ -1192,158 +812,108 @@ define double @test_v4f64(<4 x double> %a0) {
define double @test_v8f64(<8 x double> %a0) {
; SSE2-LABEL: test_v8f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[3,3]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pandn %xmm2, %xmm7
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: por %xmm5, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: por %xmm7, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: maxpd %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: cmpunordpd %xmm6, %xmm2
-; SSE2-NEXT: andpd %xmm2, %xmm6
-; SSE2-NEXT: andnpd %xmm0, %xmm2
-; SSE2-NEXT: orpd %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE2-NEXT: xorpd %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: movaps %xmm1, %xmm6
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[3,3]
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm7
-; SSE2-NEXT: pandn %xmm3, %xmm7
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: por %xmm6, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: maxpd %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: cmpunordpd %xmm4, %xmm3
-; SSE2-NEXT: andpd %xmm3, %xmm4
-; SSE2-NEXT: andnpd %xmm1, %xmm3
-; SSE2-NEXT: orpd %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: maxpd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: cmpunordpd %xmm0, %xmm1
-; SSE2-NEXT: andpd %xmm1, %xmm0
-; SSE2-NEXT: andnpd %xmm2, %xmm1
-; SSE2-NEXT: orpd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: js .LBB7_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: .LBB7_2:
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: cmpunordsd %xmm3, %xmm0
; SSE2-NEXT: movapd %xmm0, %xmm4
-; SSE2-NEXT: andpd %xmm3, %xmm4
-; SSE2-NEXT: js .LBB7_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: .LBB7_4:
-; SSE2-NEXT: maxsd %xmm2, %xmm3
-; SSE2-NEXT: andnpd %xmm3, %xmm0
-; SSE2-NEXT: orpd %xmm4, %xmm0
+; SSE2-NEXT: maxpd %xmm2, %xmm4
+; SSE2-NEXT: movapd {{.*#+}} xmm2 = [NaN,NaN]
+; SSE2-NEXT: movapd %xmm0, %xmm6
+; SSE2-NEXT: orpd %xmm2, %xmm6
+; SSE2-NEXT: andpd %xmm4, %xmm6
+; SSE2-NEXT: movapd %xmm0, %xmm5
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm5
+; SSE2-NEXT: andpd %xmm5, %xmm0
+; SSE2-NEXT: andnpd %xmm6, %xmm5
+; SSE2-NEXT: orpd %xmm0, %xmm5
+; SSE2-NEXT: movapd %xmm5, %xmm4
+; SSE2-NEXT: cmpunordpd %xmm5, %xmm4
+; SSE2-NEXT: movapd %xmm5, %xmm0
+; SSE2-NEXT: andpd %xmm4, %xmm0
+; SSE2-NEXT: movapd %xmm1, %xmm6
+; SSE2-NEXT: maxpd %xmm3, %xmm6
+; SSE2-NEXT: movapd %xmm1, %xmm3
+; SSE2-NEXT: orpd %xmm2, %xmm3
+; SSE2-NEXT: andpd %xmm6, %xmm3
+; SSE2-NEXT: movapd %xmm1, %xmm6
+; SSE2-NEXT: cmpunordpd %xmm1, %xmm6
+; SSE2-NEXT: andpd %xmm6, %xmm1
+; SSE2-NEXT: andnpd %xmm3, %xmm6
+; SSE2-NEXT: orpd %xmm1, %xmm6
+; SSE2-NEXT: movapd %xmm5, %xmm1
+; SSE2-NEXT: maxpd %xmm6, %xmm1
+; SSE2-NEXT: orpd %xmm2, %xmm5
+; SSE2-NEXT: andpd %xmm1, %xmm5
+; SSE2-NEXT: andnpd %xmm5, %xmm4
+; SSE2-NEXT: orpd %xmm0, %xmm4
+; SSE2-NEXT: movapd %xmm4, %xmm0
+; SSE2-NEXT: cmpunordsd %xmm4, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: andpd %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
+; SSE2-NEXT: movapd %xmm4, %xmm5
+; SSE2-NEXT: maxsd %xmm3, %xmm5
+; SSE2-NEXT: orpd %xmm2, %xmm4
+; SSE2-NEXT: andpd %xmm5, %xmm4
+; SSE2-NEXT: andnpd %xmm4, %xmm0
+; SSE2-NEXT: orpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f64:
; SSE41: # %bb.0:
; SSE41-NEXT: movapd %xmm0, %xmm4
-; SSE41-NEXT: movapd %xmm1, %xmm6
; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: maxpd %xmm6, %xmm5
-; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
-; SSE41-NEXT: movapd %xmm4, %xmm3
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: maxpd %xmm3, %xmm1
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
+; SSE41-NEXT: maxpd %xmm3, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm3 = [NaN,NaN]
+; SSE41-NEXT: movapd %xmm1, %xmm5
+; SSE41-NEXT: orpd %xmm3, %xmm5
+; SSE41-NEXT: andpd %xmm0, %xmm5
; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
+; SSE41-NEXT: cmpunordpd %xmm1, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm1
-; SSE41-NEXT: maxpd %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm5, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
+; SSE41-NEXT: movapd %xmm4, %xmm0
+; SSE41-NEXT: maxpd %xmm2, %xmm0
+; SSE41-NEXT: movapd %xmm4, %xmm1
+; SSE41-NEXT: orpd %xmm3, %xmm1
+; SSE41-NEXT: andpd %xmm0, %xmm1
+; SSE41-NEXT: movapd %xmm4, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: maxpd %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB7_1
-; SSE41-NEXT: # %bb.2:
+; SSE41-NEXT: orpd %xmm3, %xmm2
+; SSE41-NEXT: andpd %xmm0, %xmm2
; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: jmp .LBB7_3
-; SSE41-NEXT: .LBB7_1:
+; SSE41-NEXT: cmpunordpd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: .LBB7_3:
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; SSE41-NEXT: movapd %xmm2, %xmm1
; SSE41-NEXT: maxsd %xmm0, %xmm1
+; SSE41-NEXT: orpd %xmm2, %xmm3
+; SSE41-NEXT: andpd %xmm1, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f64:
; AVX: # %bb.0:
-; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm2
-; AVX-NEXT: vblendvpd %ymm0, %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm1
-; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2
-; AVX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; AVX-NEXT: vorpd %ymm2, %ymm0, %ymm3
+; AVX-NEXT: vandpd %ymm1, %ymm3, %ymm1
+; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm3
+; AVX-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX-NEXT: vmovq %xmm1, %rax
-; AVX-NEXT: testq %rax, %rax
-; AVX-NEXT: js .LBB7_1
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: vmovapd %xmm1, %xmm2
-; AVX-NEXT: jmp .LBB7_3
-; AVX-NEXT: .LBB7_1:
-; AVX-NEXT: vmovapd %xmm0, %xmm2
-; AVX-NEXT: vmovapd %xmm1, %xmm0
-; AVX-NEXT: .LBB7_3:
-; AVX-NEXT: vmaxsd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vorpd %xmm2, %xmm0, %xmm3
+; AVX-NEXT: vandpd %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm3
+; AVX-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vorpd %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vandpd %xmm1, %xmm2, %xmm1
; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
@@ -1352,26 +922,22 @@ define double @test_v8f64(<8 x double> %a0) {
; AVX512BW-LABEL: test_v8f64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT: vblendvpd %ymm0, %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vmaxpd %ymm2, %ymm0, %ymm1
-; AVX512BW-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2
-; AVX512BW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vmaxpd %ymm1, %ymm0, %ymm1
+; AVX512BW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; AVX512BW-NEXT: vorpd %ymm2, %ymm0, %ymm3
+; AVX512BW-NEXT: vandpd %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm3
+; AVX512BW-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
-; AVX512BW-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
-; AVX512BW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: testq %rax, %rax
-; AVX512BW-NEXT: sets %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovapd %xmm0, %xmm2
-; AVX512BW-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512BW-NEXT: vmaxpd %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vorpd %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vandpd %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm3
+; AVX512BW-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512BW-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vorpd %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT: vandpd %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vzeroupper
@@ -1379,33 +945,22 @@ define double @test_v8f64(<8 x double> %a0) {
;
; AVX512VL-LABEL: test_v8f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm1, %k1
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vblendmpd %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vmaxpd %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN]
+; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 & (ymm0 | ymm2)
+; AVX512VL-NEXT: vcmpunordpd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vmovapd %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmaxpd %ymm2, %ymm1, %ymm0
-; AVX512VL-NEXT: vcmpunordpd %ymm1, %ymm1, %k1
-; AVX512VL-NEXT: vmovapd %ymm1, %ymm0 {%k1}
-; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vblendmpd %xmm1, %xmm0, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovapd %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxpd %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX512VL-NEXT: vmaxpd %xmm0, %xmm1, %xmm3
+; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 & (xmm1 | xmm2)
; AVX512VL-NEXT: vcmpunordpd %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: testq %rax, %rax
-; AVX512VL-NEXT: sets %al
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovapd %xmm0, %xmm2
-; AVX512VL-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
-; AVX512VL-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vmovapd %xmm1, %xmm3 {%k1}
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm3[1,0]
+; AVX512VL-NEXT: vmaxsd %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 & (xmm3 | xmm2)
+; AVX512VL-NEXT: vcmpunordsd %xmm3, %xmm3, %k1
+; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%1 = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> %a0)
@@ -1415,279 +970,190 @@ define double @test_v8f64(<8 x double> %a0) {
define double @test_v16f64(<16 x double> %a0) {
; SSE2-LABEL: test_v16f64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm9
-; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[3,3]
-; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm9
-; SSE2-NEXT: pandn %xmm0, %xmm9
-; SSE2-NEXT: movdqa %xmm10, %xmm11
-; SSE2-NEXT: pandn %xmm4, %xmm11
-; SSE2-NEXT: pand %xmm10, %xmm4
-; SSE2-NEXT: por %xmm9, %xmm4
-; SSE2-NEXT: pand %xmm0, %xmm10
-; SSE2-NEXT: por %xmm11, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm0
-; SSE2-NEXT: maxpd %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm10, %xmm4
-; SSE2-NEXT: cmpunordpd %xmm10, %xmm4
-; SSE2-NEXT: andpd %xmm4, %xmm10
-; SSE2-NEXT: andnpd %xmm0, %xmm4
-; SSE2-NEXT: orpd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
-; SSE2-NEXT: xorpd %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: pandn %xmm4, %xmm9
-; SSE2-NEXT: movaps %xmm2, %xmm10
-; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm2[3,3]
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
-; SSE2-NEXT: movdqa %xmm11, %xmm10
-; SSE2-NEXT: pandn %xmm2, %xmm10
-; SSE2-NEXT: movdqa %xmm11, %xmm12
-; SSE2-NEXT: pandn %xmm6, %xmm12
-; SSE2-NEXT: pand %xmm11, %xmm6
-; SSE2-NEXT: por %xmm10, %xmm6
-; SSE2-NEXT: pand %xmm2, %xmm11
-; SSE2-NEXT: por %xmm12, %xmm11
-; SSE2-NEXT: movdqa %xmm11, %xmm2
-; SSE2-NEXT: maxpd %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm11, %xmm6
-; SSE2-NEXT: cmpunordpd %xmm11, %xmm6
-; SSE2-NEXT: andpd %xmm6, %xmm11
-; SSE2-NEXT: andnpd %xmm2, %xmm6
-; SSE2-NEXT: orpd %xmm11, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm6
-; SSE2-NEXT: por %xmm9, %xmm6
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: maxpd %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movapd %xmm0, %xmm8
+; SSE2-NEXT: maxpd %xmm4, %xmm8
+; SSE2-NEXT: movapd {{.*#+}} xmm4 = [NaN,NaN]
+; SSE2-NEXT: movapd %xmm0, %xmm9
+; SSE2-NEXT: orpd %xmm4, %xmm9
+; SSE2-NEXT: andpd %xmm8, %xmm9
+; SSE2-NEXT: movapd %xmm0, %xmm8
+; SSE2-NEXT: cmpunordpd %xmm0, %xmm8
+; SSE2-NEXT: andpd %xmm8, %xmm0
+; SSE2-NEXT: andnpd %xmm9, %xmm8
+; SSE2-NEXT: orpd %xmm0, %xmm8
+; SSE2-NEXT: movapd %xmm8, %xmm0
+; SSE2-NEXT: cmpunordpd %xmm8, %xmm0
+; SSE2-NEXT: movapd %xmm8, %xmm9
+; SSE2-NEXT: andpd %xmm0, %xmm9
+; SSE2-NEXT: movapd %xmm2, %xmm10
+; SSE2-NEXT: maxpd %xmm6, %xmm10
+; SSE2-NEXT: movapd %xmm2, %xmm6
+; SSE2-NEXT: orpd %xmm4, %xmm6
+; SSE2-NEXT: andpd %xmm10, %xmm6
+; SSE2-NEXT: movapd %xmm2, %xmm10
+; SSE2-NEXT: cmpunordpd %xmm2, %xmm10
+; SSE2-NEXT: andpd %xmm10, %xmm2
+; SSE2-NEXT: andnpd %xmm6, %xmm10
+; SSE2-NEXT: orpd %xmm2, %xmm10
+; SSE2-NEXT: movapd %xmm8, %xmm2
+; SSE2-NEXT: maxpd %xmm10, %xmm2
+; SSE2-NEXT: orpd %xmm4, %xmm8
+; SSE2-NEXT: andpd %xmm2, %xmm8
+; SSE2-NEXT: andnpd %xmm8, %xmm0
+; SSE2-NEXT: orpd %xmm9, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm2
; SSE2-NEXT: cmpunordpd %xmm0, %xmm2
-; SSE2-NEXT: andpd %xmm2, %xmm0
-; SSE2-NEXT: andnpd %xmm4, %xmm2
-; SSE2-NEXT: orpd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: xorpd %xmm0, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: movaps %xmm1, %xmm6
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[3,3]
-; SSE2-NEXT: pxor %xmm9, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm9, %xmm10
-; SSE2-NEXT: pandn %xmm5, %xmm10
-; SSE2-NEXT: pand %xmm9, %xmm5
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm1, %xmm9
-; SSE2-NEXT: por %xmm10, %xmm9
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: maxpd %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm5
-; SSE2-NEXT: cmpunordpd %xmm9, %xmm5
-; SSE2-NEXT: andpd %xmm5, %xmm9
-; SSE2-NEXT: andnpd %xmm1, %xmm5
-; SSE2-NEXT: orpd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT: xorpd %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pandn %xmm5, %xmm6
-; SSE2-NEXT: movaps %xmm3, %xmm9
-; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm3[3,3]
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm8
-; SSE2-NEXT: movdqa %xmm8, %xmm9
-; SSE2-NEXT: pandn %xmm3, %xmm9
-; SSE2-NEXT: movdqa %xmm8, %xmm10
-; SSE2-NEXT: pandn %xmm7, %xmm10
-; SSE2-NEXT: pand %xmm8, %xmm7
-; SSE2-NEXT: por %xmm9, %xmm7
-; SSE2-NEXT: pand %xmm3, %xmm8
-; SSE2-NEXT: por %xmm10, %xmm8
-; SSE2-NEXT: movdqa %xmm8, %xmm3
-; SSE2-NEXT: maxpd %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm8, %xmm7
-; SSE2-NEXT: cmpunordpd %xmm8, %xmm7
-; SSE2-NEXT: andpd %xmm7, %xmm8
-; SSE2-NEXT: andnpd %xmm3, %xmm7
-; SSE2-NEXT: orpd %xmm8, %xmm7
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm7
-; SSE2-NEXT: por %xmm6, %xmm7
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: maxpd %xmm7, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: movapd %xmm0, %xmm6
+; SSE2-NEXT: andpd %xmm2, %xmm6
+; SSE2-NEXT: movapd %xmm1, %xmm8
+; SSE2-NEXT: maxpd %xmm5, %xmm8
+; SSE2-NEXT: movapd %xmm1, %xmm9
+; SSE2-NEXT: orpd %xmm4, %xmm9
+; SSE2-NEXT: andpd %xmm8, %xmm9
+; SSE2-NEXT: movapd %xmm1, %xmm5
; SSE2-NEXT: cmpunordpd %xmm1, %xmm5
; SSE2-NEXT: andpd %xmm5, %xmm1
-; SSE2-NEXT: andnpd %xmm3, %xmm5
+; SSE2-NEXT: andnpd %xmm9, %xmm5
; SSE2-NEXT: orpd %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: maxpd %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: cmpunordpd %xmm0, %xmm1
-; SSE2-NEXT: andpd %xmm1, %xmm0
-; SSE2-NEXT: andnpd %xmm2, %xmm1
-; SSE2-NEXT: orpd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: js .LBB8_2
-; SSE2-NEXT: # %bb.1:
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: .LBB8_2:
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: cmpunordsd %xmm3, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm4
-; SSE2-NEXT: andpd %xmm3, %xmm4
-; SSE2-NEXT: js .LBB8_4
-; SSE2-NEXT: # %bb.3:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: .LBB8_4:
-; SSE2-NEXT: maxsd %xmm2, %xmm3
-; SSE2-NEXT: andnpd %xmm3, %xmm0
+; SSE2-NEXT: movapd %xmm5, %xmm1
+; SSE2-NEXT: cmpunordpd %xmm5, %xmm1
+; SSE2-NEXT: movapd %xmm5, %xmm8
+; SSE2-NEXT: andpd %xmm1, %xmm8
+; SSE2-NEXT: movapd %xmm3, %xmm9
+; SSE2-NEXT: maxpd %xmm7, %xmm9
+; SSE2-NEXT: movapd %xmm3, %xmm7
+; SSE2-NEXT: orpd %xmm4, %xmm7
+; SSE2-NEXT: andpd %xmm9, %xmm7
+; SSE2-NEXT: movapd %xmm3, %xmm9
+; SSE2-NEXT: cmpunordpd %xmm3, %xmm9
+; SSE2-NEXT: andpd %xmm9, %xmm3
+; SSE2-NEXT: andnpd %xmm7, %xmm9
+; SSE2-NEXT: orpd %xmm3, %xmm9
+; SSE2-NEXT: movapd %xmm5, %xmm3
+; SSE2-NEXT: maxpd %xmm9, %xmm3
+; SSE2-NEXT: orpd %xmm4, %xmm5
+; SSE2-NEXT: andpd %xmm3, %xmm5
+; SSE2-NEXT: andnpd %xmm5, %xmm1
+; SSE2-NEXT: orpd %xmm8, %xmm1
+; SSE2-NEXT: movapd %xmm0, %xmm3
+; SSE2-NEXT: maxpd %xmm1, %xmm3
; SSE2-NEXT: orpd %xmm4, %xmm0
+; SSE2-NEXT: andpd %xmm3, %xmm0
+; SSE2-NEXT: andnpd %xmm0, %xmm2
+; SSE2-NEXT: orpd %xmm6, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: cmpunordsd %xmm2, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: andpd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT: movapd %xmm2, %xmm5
+; SSE2-NEXT: maxsd %xmm3, %xmm5
+; SSE2-NEXT: orpd %xmm4, %xmm2
+; SSE2-NEXT: andpd %xmm5, %xmm2
+; SSE2-NEXT: andnpd %xmm2, %xmm0
+; SSE2-NEXT: orpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movapd %xmm1, %xmm8
-; SSE41-NEXT: movapd %xmm0, %xmm1
-; SSE41-NEXT: movapd %xmm3, %xmm10
+; SSE41-NEXT: movapd %xmm0, %xmm8
; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm10
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: movapd %xmm7, %xmm9
-; SSE41-NEXT: maxpd %xmm10, %xmm9
-; SSE41-NEXT: movapd %xmm7, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm7, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm9
-; SSE41-NEXT: movapd %xmm8, %xmm7
-; SSE41-NEXT: movapd %xmm8, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm3
-; SSE41-NEXT: maxpd %xmm7, %xmm3
-; SSE41-NEXT: movapd %xmm5, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm5
+; SSE41-NEXT: maxpd %xmm7, %xmm0
+; SSE41-NEXT: movapd {{.*#+}} xmm7 = [NaN,NaN]
+; SSE41-NEXT: movapd %xmm3, %xmm9
+; SSE41-NEXT: orpd %xmm7, %xmm9
+; SSE41-NEXT: andpd %xmm0, %xmm9
; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm5
+; SSE41-NEXT: cmpunordpd %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9
-; SSE41-NEXT: movapd %xmm9, %xmm3
-; SSE41-NEXT: maxpd %xmm5, %xmm3
-; SSE41-NEXT: movapd %xmm9, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm9, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3
-; SSE41-NEXT: movapd %xmm2, %xmm5
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE41-NEXT: movapd %xmm6, %xmm2
-; SSE41-NEXT: maxpd %xmm5, %xmm2
-; SSE41-NEXT: movapd %xmm6, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2
-; SSE41-NEXT: movapd %xmm1, %xmm5
; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm1
-; SSE41-NEXT: maxpd %xmm5, %xmm1
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: cmpunordpd %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm4
+; SSE41-NEXT: maxpd %xmm5, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: orpd %xmm7, %xmm3
+; SSE41-NEXT: andpd %xmm0, %xmm3
; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: maxpd %xmm4, %xmm1
+; SSE41-NEXT: cmpunordpd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: maxpd %xmm9, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm1
+; SSE41-NEXT: orpd %xmm7, %xmm1
+; SSE41-NEXT: andpd %xmm0, %xmm1
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: maxpd %xmm6, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm3
+; SSE41-NEXT: orpd %xmm7, %xmm3
+; SSE41-NEXT: andpd %xmm0, %xmm3
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: cmpunordpd %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: movapd %xmm8, %xmm0
+; SSE41-NEXT: maxpd %xmm4, %xmm0
+; SSE41-NEXT: movapd %xmm8, %xmm2
+; SSE41-NEXT: orpd %xmm7, %xmm2
+; SSE41-NEXT: andpd %xmm0, %xmm2
+; SSE41-NEXT: movapd %xmm8, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm8, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: maxpd %xmm3, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm3
+; SSE41-NEXT: orpd %xmm7, %xmm3
+; SSE41-NEXT: andpd %xmm0, %xmm3
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: cmpunordpd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: maxpd %xmm1, %xmm0
; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: maxpd %xmm2, %xmm1
+; SSE41-NEXT: orpd %xmm7, %xmm1
+; SSE41-NEXT: andpd %xmm0, %xmm1
; SSE41-NEXT: movapd %xmm3, %xmm0
; SSE41-NEXT: cmpunordpd %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB8_1
-; SSE41-NEXT: # %bb.2:
; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: jmp .LBB8_3
-; SSE41-NEXT: .LBB8_1:
-; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: .LBB8_3:
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: maxsd %xmm0, %xmm1
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: maxsd %xmm0, %xmm2
+; SSE41-NEXT: orpd %xmm1, %xmm7
+; SSE41-NEXT: andpd %xmm2, %xmm7
; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
+; SSE41-NEXT: movapd %xmm7, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f64:
; AVX: # %bb.0:
-; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm4
-; AVX-NEXT: vblendvpd %ymm1, %ymm1, %ymm3, %ymm1
-; AVX-NEXT: vmaxpd %ymm4, %ymm1, %ymm3
-; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm4
-; AVX-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
-; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm3
-; AVX-NEXT: vblendvpd %ymm0, %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vmaxpd %ymm3, %ymm0, %ymm2
+; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm3
+; AVX-NEXT: vbroadcastsd {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN]
+; AVX-NEXT: vorpd %ymm4, %ymm1, %ymm5
+; AVX-NEXT: vandpd %ymm3, %ymm5, %ymm3
+; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm5
+; AVX-NEXT: vblendvpd %ymm5, %ymm1, %ymm3, %ymm1
+; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm2
+; AVX-NEXT: vorpd %ymm4, %ymm0, %ymm3
+; AVX-NEXT: vandpd %ymm2, %ymm3, %ymm2
; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm3
; AVX-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm2
-; AVX-NEXT: vblendvpd %ymm0, %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm1
+; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vorpd %ymm4, %ymm0, %ymm2
+; AVX-NEXT: vandpd %ymm1, %ymm2, %ymm1
; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2
; AVX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vorpd %xmm4, %xmm0, %xmm2
+; AVX-NEXT: vandpd %xmm1, %xmm2, %xmm1
; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX-NEXT: vmovq %xmm1, %rax
-; AVX-NEXT: testq %rax, %rax
-; AVX-NEXT: js .LBB8_1
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: vmovapd %xmm1, %xmm2
-; AVX-NEXT: jmp .LBB8_3
-; AVX-NEXT: .LBB8_1:
-; AVX-NEXT: vmovapd %xmm0, %xmm2
-; AVX-NEXT: vmovapd %xmm1, %xmm0
-; AVX-NEXT: .LBB8_3:
-; AVX-NEXT: vmaxsd %xmm2, %xmm0, %xmm1
+; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vorpd %xmm4, %xmm0, %xmm2
+; AVX-NEXT: vandpd %xmm1, %xmm2, %xmm1
; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
@@ -1695,34 +1161,27 @@ define double @test_v16f64(<16 x double> %a0) {
;
; AVX512BW-LABEL: test_v16f64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
-; AVX512BW-NEXT: vblendmpd %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmaxpd %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 & (zmm0 | zmm2)
+; AVX512BW-NEXT: vcmpunordpd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vmaxpd %zmm2, %zmm1, %zmm0
-; AVX512BW-NEXT: vcmpunordpd %zmm1, %zmm1, %k1
-; AVX512BW-NEXT: vmovapd %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT: vblendvpd %ymm0, %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vmaxpd %ymm2, %ymm0, %ymm1
-; AVX512BW-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2
-; AVX512BW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vextractf64x4 $1, %zmm1, %ymm0
+; AVX512BW-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vorpd %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT: vandpd %ymm0, %ymm3, %ymm0
+; AVX512BW-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm3
+; AVX512BW-NEXT: vblendvpd %ymm3, %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vmaxpd %xmm2, %xmm0, %xmm1
-; AVX512BW-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2
-; AVX512BW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: testq %rax, %rax
-; AVX512BW-NEXT: sets %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovapd %xmm0, %xmm2
-; AVX512BW-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512BW-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512BW-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512BW-NEXT: vmaxpd %xmm1, %xmm0, %xmm1
+; AVX512BW-NEXT: vorpd %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vandpd %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm3
+; AVX512BW-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512BW-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vorpd %xmm2, %xmm1, %xmm2
+; AVX512BW-NEXT: vandpd %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
; AVX512BW-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vzeroupper
@@ -1730,38 +1189,25 @@ define double @test_v16f64(<16 x double> %a0) {
;
; AVX512VL-LABEL: test_v16f64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
-; AVX512VL-NEXT: vblendmpd %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512VL-NEXT: vmaxpd %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 & (zmm0 | zmm2)
+; AVX512VL-NEXT: vcmpunordpd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; AVX512VL-NEXT: vmaxpd %zmm2, %zmm1, %zmm0
-; AVX512VL-NEXT: vcmpunordpd %zmm1, %zmm1, %k1
-; AVX512VL-NEXT: vmovapd %zmm1, %zmm0 {%k1}
-; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpcmpgtq %ymm0, %ymm1, %k1
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vblendmpd %ymm1, %ymm0, %ymm2 {%k1}
-; AVX512VL-NEXT: vmovapd %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vmaxpd %ymm2, %ymm1, %ymm0
+; AVX512VL-NEXT: vextractf64x4 $1, %zmm1, %ymm0
+; AVX512VL-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vmovdqa %ymm2, %ymm3
+; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 & (ymm3 | ymm1)
; AVX512VL-NEXT: vcmpunordpd %ymm1, %ymm1, %k1
-; AVX512VL-NEXT: vmovapd %ymm1, %ymm0 {%k1}
-; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vblendmpd %xmm1, %xmm0, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovapd %xmm0, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxpd %xmm2, %xmm1, %xmm0
-; AVX512VL-NEXT: vcmpunordpd %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: testq %rax, %rax
-; AVX512VL-NEXT: sets %al
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vmovapd %xmm0, %xmm2
-; AVX512VL-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
-; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
-; AVX512VL-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovapd %ymm1, %ymm3 {%k1}
+; AVX512VL-NEXT: vextractf128 $1, %ymm3, %xmm0
+; AVX512VL-NEXT: vmaxpd %xmm0, %xmm3, %xmm1
+; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 & (xmm3 | xmm2)
+; AVX512VL-NEXT: vcmpunordpd %xmm3, %xmm3, %k1
+; AVX512VL-NEXT: vmovapd %xmm3, %xmm1 {%k1}
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512VL-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 & (xmm1 | xmm2)
; AVX512VL-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vzeroupper
@@ -1782,4 +1228,5 @@ declare double @llvm.vector.reduce.fmaximum.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmaximum.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmaximum.v16f64(<16 x double>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}
; SSE: {{.*}}
>From 1757f2e6f867319e9402781bdebd7f59fdda8c12 Mon Sep 17 00:00:00 2001
From: valadaptive <valadaptive at protonmail.com>
Date: Mon, 1 Dec 2025 05:27:07 -0500
Subject: [PATCH 2/2] [X86] Remove unused check prefix from
vector-reduce-fmaximum.ll
---
llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
index fcec1a3ce5616..ef49ae8402c66 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
@@ -3,8 +3,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512VL
;
; vXf32
@@ -1228,5 +1228,4 @@ declare double @llvm.vector.reduce.fmaximum.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmaximum.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmaximum.v16f64(<16 x double>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX512: {{.*}}
; SSE: {{.*}}