[llvm] [X86] LowerSelect - use BLENDV for scalar selection on all SSE41+ targets (PR #125853)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 10 02:08:48 PST 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/125853
From 5e164e0a14c15863c8869cf926b1ec6db21c2274 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 5 Feb 2025 13:09:43 +0000
Subject: [PATCH 1/2] [X86] LowerSelect - use BLENDV for scalar selection if
not all operands are multi use
When we first began lowering f32/f64 selects to X86ISD::BLENDV(scalar_to_vector(),scalar_to_vector(),scalar_to_vector()) in 2015, we limited it to AVX targets to avoid issues with SSE41's implicit xmm0 constraint for the condition mask.
Since then we've seen general improvements in TwoAddressInstruction and better handling of condition commutation for X86ISD::BLENDV nodes, which should address many of the original concerns about using SSE41 BLENDVPD/S. If we allow SSE41 cases where the condition and another operand are one-use, then the extra moves should never cost more than the logic ops they avoid (we still assume SSE41 BLENDV is more expensive than general logic).
Fixes #105807
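To make the affected pattern concrete, here is a minimal IR sketch (the function name is illustrative; the body mirrors test17 in fp-select-cmp-and.ll below) of a scalar fcmp feeding a scalar select, which SSE4.1-only targets can now lower with CMPSS + BLENDVPS instead of the ANDPS/ANDNPS/ORPS triple:

    define float @select_oge(float %a, float %b, float %c, float %eps) {
      ; scalar compare + select: now a candidate for X86ISD::BLENDV on
      ; SSE4.1, provided at least one operand is one-use
      %cmp = fcmp oge float %a, %eps
      %cond = select i1 %cmp, float %c, float %b
      ret float %cond
    }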
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 11 +-
llvm/test/CodeGen/X86/fmaxnum.ll | 124 +++--
llvm/test/CodeGen/X86/fminnum.ll | 124 +++--
llvm/test/CodeGen/X86/fp-select-cmp-and.ll | 10 +-
llvm/test/CodeGen/X86/setcc-combine.ll | 56 ++-
llvm/test/CodeGen/X86/vec_floor.ll | 28 +-
llvm/test/CodeGen/X86/vector-reduce-fmax.ll | 258 +++++-----
.../CodeGen/X86/vector-reduce-fmaximum.ll | 467 +++++++++---------
llvm/test/CodeGen/X86/vector-reduce-fmin.ll | 249 +++++-----
llvm/test/CodeGen/X86/vselect-zero.ll | 47 +-
10 files changed, 722 insertions(+), 652 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6cf6061deba7025..bc55d772b86b8de 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24630,8 +24630,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getTargetConstant(SSECC, DL, MVT::i8));
- // If we have AVX, we can use a variable vector select (VBLENDV) instead
- // of 3 logic instructions for size savings and potentially speed.
+ // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
+ // instead of 3 logic instructions for size savings and potentially speed.
// Unfortunately, there is no scalar form of VBLENDV.
// If either operand is a +0.0 constant, don't try this. We can expect to
@@ -24641,9 +24641,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
// uses XMM0 as the selection register. That may need just as many
// instructions as the AND/ANDN/OR sequence due to register moves, so
- // don't bother.
- if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
- !isNullFPConstant(Op2)) {
+ // only attempt this if at least one of the ops (+ condition) is one use.
+ if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
+ !isNullFPConstant(Op2) &&
+ (Subtarget.hasAVX() || Op1->hasOneUse() || Op2->hasOneUse())) {
// Convert to vectors, do a VSELECT, and convert back to scalar.
// All of the conversions should be optimized away.
MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
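Conversely, here is a sketch of a case the new guard should decline (a hypothetical example, not taken from the tests below): when both select operands have additional uses, SSE4.1-only targets keep the AND/ANDN/OR lowering, on the assumption that the register moves needed for BLENDV's implicit xmm0 condition operand would outweigh the saved logic ops:

    define float @multi_use(float %a, float %b, float %eps) {
      ; neither %a nor %b is one-use here, so without AVX the
      ; (hasAVX() || Op1->hasOneUse() || Op2->hasOneUse()) guard fails
      %cmp = fcmp oge float %a, %eps
      %sel = select i1 %cmp, float %a, float %b
      %sum = fadd float %a, %b
      %r = fadd float %sel, %sum
      ret float %r
    }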
diff --git a/llvm/test/CodeGen/X86/fmaxnum.ll b/llvm/test/CodeGen/X86/fmaxnum.ll
index 2e1af1e84e07620..d6252cc85e8b454 100644
--- a/llvm/test/CodeGen/X86/fmaxnum.ll
+++ b/llvm/test/CodeGen/X86/fmaxnum.ll
@@ -22,17 +22,26 @@ declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>)
; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.
define float @test_fmaxf(float %x, float %y) {
-; SSE-LABEL: test_fmaxf:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: cmpunordss %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: andps %xmm1, %xmm3
-; SSE-NEXT: maxss %xmm0, %xmm1
-; SSE-NEXT: andnps %xmm1, %xmm2
-; SSE-NEXT: orps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_fmaxf:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test_fmaxf:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movaps %xmm1, %xmm2
+; SSE4-NEXT: maxss %xmm0, %xmm2
+; SSE4-NEXT: cmpunordss %xmm0, %xmm0
+; SSE4-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: movaps %xmm2, %xmm0
+; SSE4-NEXT: retq
;
; AVX1-LABEL: test_fmaxf:
; AVX1: # %bb.0:
@@ -63,17 +72,26 @@ define float @test_fmaxf_minsize(float %x, float %y) minsize {
; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.
define double @test_fmax(double %x, double %y) {
-; SSE-LABEL: test_fmax:
-; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: cmpunordsd %xmm0, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm3
-; SSE-NEXT: andpd %xmm1, %xmm3
-; SSE-NEXT: maxsd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm1, %xmm2
-; SSE-NEXT: orpd %xmm3, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_fmax:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm3
+; SSE2-NEXT: andpd %xmm1, %xmm3
+; SSE2-NEXT: maxsd %xmm0, %xmm1
+; SSE2-NEXT: andnpd %xmm1, %xmm2
+; SSE2-NEXT: orpd %xmm3, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test_fmax:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movapd %xmm1, %xmm2
+; SSE4-NEXT: maxsd %xmm0, %xmm2
+; SSE4-NEXT: cmpunordsd %xmm0, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: retq
;
; AVX1-LABEL: test_fmax:
; AVX1: # %bb.0:
@@ -111,17 +129,26 @@ define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) {
}
define float @test_intrinsic_fmaxf(float %x, float %y) {
-; SSE-LABEL: test_intrinsic_fmaxf:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: cmpunordss %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: andps %xmm1, %xmm3
-; SSE-NEXT: maxss %xmm0, %xmm1
-; SSE-NEXT: andnps %xmm1, %xmm2
-; SSE-NEXT: orps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_intrinsic_fmaxf:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: maxss %xmm0, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test_intrinsic_fmaxf:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movaps %xmm1, %xmm2
+; SSE4-NEXT: maxss %xmm0, %xmm2
+; SSE4-NEXT: cmpunordss %xmm0, %xmm0
+; SSE4-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: movaps %xmm2, %xmm0
+; SSE4-NEXT: retq
;
; AVX1-LABEL: test_intrinsic_fmaxf:
; AVX1: # %bb.0:
@@ -142,17 +169,26 @@ define float @test_intrinsic_fmaxf(float %x, float %y) {
}
define double @test_intrinsic_fmax(double %x, double %y) {
-; SSE-LABEL: test_intrinsic_fmax:
-; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: cmpunordsd %xmm0, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm3
-; SSE-NEXT: andpd %xmm1, %xmm3
-; SSE-NEXT: maxsd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm1, %xmm2
-; SSE-NEXT: orpd %xmm3, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_intrinsic_fmax:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm3
+; SSE2-NEXT: andpd %xmm1, %xmm3
+; SSE2-NEXT: maxsd %xmm0, %xmm1
+; SSE2-NEXT: andnpd %xmm1, %xmm2
+; SSE2-NEXT: orpd %xmm3, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test_intrinsic_fmax:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movapd %xmm1, %xmm2
+; SSE4-NEXT: maxsd %xmm0, %xmm2
+; SSE4-NEXT: cmpunordsd %xmm0, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: retq
;
; AVX1-LABEL: test_intrinsic_fmax:
; AVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/fminnum.ll b/llvm/test/CodeGen/X86/fminnum.ll
index 1290a7b81910679..0ef8fdec33d937a 100644
--- a/llvm/test/CodeGen/X86/fminnum.ll
+++ b/llvm/test/CodeGen/X86/fminnum.ll
@@ -22,17 +22,26 @@ declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>)
; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.
define float @test_fminf(float %x, float %y) {
-; SSE-LABEL: test_fminf:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: cmpunordss %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: andps %xmm1, %xmm3
-; SSE-NEXT: minss %xmm0, %xmm1
-; SSE-NEXT: andnps %xmm1, %xmm2
-; SSE-NEXT: orps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_fminf:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: minss %xmm0, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test_fminf:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movaps %xmm1, %xmm2
+; SSE4-NEXT: minss %xmm0, %xmm2
+; SSE4-NEXT: cmpunordss %xmm0, %xmm0
+; SSE4-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: movaps %xmm2, %xmm0
+; SSE4-NEXT: retq
;
; AVX1-LABEL: test_fminf:
; AVX1: # %bb.0:
@@ -63,17 +72,26 @@ define float @test_fminf_minsize(float %x, float %y) minsize {
; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.
define double @test_fmin(double %x, double %y) {
-; SSE-LABEL: test_fmin:
-; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: cmpunordsd %xmm0, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm3
-; SSE-NEXT: andpd %xmm1, %xmm3
-; SSE-NEXT: minsd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm1, %xmm2
-; SSE-NEXT: orpd %xmm3, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_fmin:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm3
+; SSE2-NEXT: andpd %xmm1, %xmm3
+; SSE2-NEXT: minsd %xmm0, %xmm1
+; SSE2-NEXT: andnpd %xmm1, %xmm2
+; SSE2-NEXT: orpd %xmm3, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test_fmin:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movapd %xmm1, %xmm2
+; SSE4-NEXT: minsd %xmm0, %xmm2
+; SSE4-NEXT: cmpunordsd %xmm0, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: retq
;
; AVX1-LABEL: test_fmin:
; AVX1: # %bb.0:
@@ -111,17 +129,26 @@ define x86_fp80 @test_fminl(x86_fp80 %x, x86_fp80 %y) {
}
define float @test_intrinsic_fminf(float %x, float %y) {
-; SSE-LABEL: test_intrinsic_fminf:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: cmpunordss %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: andps %xmm1, %xmm3
-; SSE-NEXT: minss %xmm0, %xmm1
-; SSE-NEXT: andnps %xmm1, %xmm2
-; SSE-NEXT: orps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_intrinsic_fminf:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: cmpunordss %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm1, %xmm3
+; SSE2-NEXT: minss %xmm0, %xmm1
+; SSE2-NEXT: andnps %xmm1, %xmm2
+; SSE2-NEXT: orps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test_intrinsic_fminf:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movaps %xmm1, %xmm2
+; SSE4-NEXT: minss %xmm0, %xmm2
+; SSE4-NEXT: cmpunordss %xmm0, %xmm0
+; SSE4-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: movaps %xmm2, %xmm0
+; SSE4-NEXT: retq
;
; AVX1-LABEL: test_intrinsic_fminf:
; AVX1: # %bb.0:
@@ -142,17 +169,26 @@ define float @test_intrinsic_fminf(float %x, float %y) {
}
define double @test_intrinsic_fmin(double %x, double %y) {
-; SSE-LABEL: test_intrinsic_fmin:
-; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: cmpunordsd %xmm0, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm3
-; SSE-NEXT: andpd %xmm1, %xmm3
-; SSE-NEXT: minsd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm1, %xmm2
-; SSE-NEXT: orpd %xmm3, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_intrinsic_fmin:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm3
+; SSE2-NEXT: andpd %xmm1, %xmm3
+; SSE2-NEXT: minsd %xmm0, %xmm1
+; SSE2-NEXT: andnpd %xmm1, %xmm2
+; SSE2-NEXT: orpd %xmm3, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: test_intrinsic_fmin:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movapd %xmm1, %xmm2
+; SSE4-NEXT: minsd %xmm0, %xmm2
+; SSE4-NEXT: cmpunordsd %xmm0, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: retq
;
; AVX1-LABEL: test_intrinsic_fmin:
; AVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/fp-select-cmp-and.ll b/llvm/test/CodeGen/X86/fp-select-cmp-and.ll
index 0f6159d36ea818a..1d006f725ca34d5 100644
--- a/llvm/test/CodeGen/X86/fp-select-cmp-and.ll
+++ b/llvm/test/CodeGen/X86/fp-select-cmp-and.ll
@@ -189,10 +189,9 @@ define float @test17(float %a, float %b, float %c, float %eps) {
; CHECK-LABEL: test17:
; CHECK: # %bb.0:
; CHECK-NEXT: cmpless %xmm0, %xmm3
-; CHECK-NEXT: andps %xmm3, %xmm2
-; CHECK-NEXT: andnps %xmm1, %xmm3
-; CHECK-NEXT: orps %xmm2, %xmm3
; CHECK-NEXT: movaps %xmm3, %xmm0
+; CHECK-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
%cmp = fcmp oge float %a, %eps
%cond = select i1 %cmp, float %c, float %b
@@ -203,10 +202,9 @@ define double @test18(double %a, double %b, double %c, double %eps) {
; CHECK-LABEL: test18:
; CHECK: # %bb.0:
; CHECK-NEXT: cmplesd %xmm0, %xmm3
-; CHECK-NEXT: andpd %xmm3, %xmm2
-; CHECK-NEXT: andnpd %xmm1, %xmm3
-; CHECK-NEXT: orpd %xmm2, %xmm3
; CHECK-NEXT: movapd %xmm3, %xmm0
+; CHECK-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; CHECK-NEXT: movapd %xmm1, %xmm0
; CHECK-NEXT: retq
%cmp = fcmp oge double %a, %eps
%cond = select i1 %cmp, double %c, double %b
diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll
index e723569bda8a126..f526db00df60629 100644
--- a/llvm/test/CodeGen/X86/setcc-combine.ll
+++ b/llvm/test/CodeGen/X86/setcc-combine.ll
@@ -463,14 +463,23 @@ define <2 x double> @oge(<2 x double> %x) {
; negative test - don't create an fneg to replace 0.0 operand
define double @ogt_no_fneg(double %x, double %y) {
-; CHECK-LABEL: ogt_no_fneg:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorpd %xmm2, %xmm2
-; CHECK-NEXT: cmpltsd %xmm0, %xmm2
-; CHECK-NEXT: andpd %xmm2, %xmm0
-; CHECK-NEXT: andnpd %xmm1, %xmm2
-; CHECK-NEXT: orpd %xmm2, %xmm0
-; CHECK-NEXT: retq
+; SSE2-LABEL: ogt_no_fneg:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorpd %xmm2, %xmm2
+; SSE2-NEXT: cmpltsd %xmm0, %xmm2
+; SSE2-NEXT: andpd %xmm2, %xmm0
+; SSE2-NEXT: andnpd %xmm1, %xmm2
+; SSE2-NEXT: orpd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: ogt_no_fneg:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movapd %xmm0, %xmm2
+; SSE41-NEXT: xorpd %xmm0, %xmm0
+; SSE41-NEXT: cmpltsd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
%cmp = fcmp ogt double %x, 0.0
%r = select i1 %cmp, double %x, double %y
ret double %r
@@ -479,16 +488,27 @@ define double @ogt_no_fneg(double %x, double %y) {
; negative test - can't change the setcc for non-zero constant
define double @ogt_no_zero(double %x) {
-; CHECK-LABEL: ogt_no_zero:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
-; CHECK-NEXT: xorpd %xmm0, %xmm1
-; CHECK-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
-; CHECK-NEXT: cmpltsd %xmm0, %xmm2
-; CHECK-NEXT: andpd %xmm2, %xmm0
-; CHECK-NEXT: andnpd %xmm1, %xmm2
-; CHECK-NEXT: orpd %xmm2, %xmm0
-; CHECK-NEXT: retq
+; SSE2-LABEL: ogt_no_zero:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; SSE2-NEXT: xorpd %xmm0, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
+; SSE2-NEXT: cmpltsd %xmm0, %xmm2
+; SSE2-NEXT: andpd %xmm2, %xmm0
+; SSE2-NEXT: andnpd %xmm1, %xmm2
+; SSE2-NEXT: orpd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: ogt_no_zero:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movapd %xmm0, %xmm1
+; SSE41-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; SSE41-NEXT: xorpd %xmm0, %xmm2
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; SSE41-NEXT: cmpltsd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: retq
%neg = fneg double %x
%cmp = fcmp ogt double %x, 1.0
%r = select i1 %cmp, double %x, double %neg
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index 65cde6ac91106bd..abb85ac83464cb2 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -1679,10 +1679,9 @@ define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x flo
; SSE41: ## %bb.0:
; SSE41-NEXT: roundss $9, %xmm0, %xmm3
; SSE41-NEXT: cmpeqss %xmm1, %xmm0
-; SSE41-NEXT: andps %xmm0, %xmm3
-; SSE41-NEXT: andnps %xmm2, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_ss_mask8:
@@ -1747,10 +1746,9 @@ define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x
; SSE41: ## %bb.0:
; SSE41-NEXT: roundsd $9, %xmm0, %xmm3
; SSE41-NEXT: cmpeqsd %xmm1, %xmm0
-; SSE41-NEXT: andpd %xmm0, %xmm3
-; SSE41-NEXT: andnpd %xmm2, %xmm0
-; SSE41-NEXT: orpd %xmm3, %xmm0
-; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_mask_sd_mask8:
@@ -2671,10 +2669,9 @@ define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x floa
; SSE41: ## %bb.0:
; SSE41-NEXT: roundss $10, %xmm0, %xmm3
; SSE41-NEXT: cmpeqss %xmm1, %xmm0
-; SSE41-NEXT: andps %xmm0, %xmm3
-; SSE41-NEXT: andnps %xmm2, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_ss_mask8:
@@ -2739,10 +2736,9 @@ define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x d
; SSE41: ## %bb.0:
; SSE41-NEXT: roundsd $10, %xmm0, %xmm3
; SSE41-NEXT: cmpeqsd %xmm1, %xmm0
-; SSE41-NEXT: andpd %xmm0, %xmm3
-; SSE41-NEXT: andnpd %xmm2, %xmm0
-; SSE41-NEXT: orpd %xmm3, %xmm0
-; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_mask_sd_mask8:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
index fe2c41f57cfab16..7048b98227620f1 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW
@@ -36,13 +36,10 @@ define float @test_v2f32(<2 x float> %a0) {
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: cmpunordss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: andps %xmm2, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm2
-; SSE41-NEXT: andnps %xmm2, %xmm1
-; SSE41-NEXT: orps %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: maxss %xmm0, %xmm1
+; SSE41-NEXT: cmpunordss %xmm0, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -89,21 +86,19 @@ define float @test_v3f32(<3 x float> %a0) {
;
; SSE41-LABEL: test_v3f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: cmpunordss %xmm0, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: andps %xmm1, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm1
-; SSE41-NEXT: andnps %xmm1, %xmm2
-; SSE41-NEXT: orps %xmm3, %xmm2
-; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: maxss %xmm2, %xmm1
+; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: movaps %xmm3, %xmm2
+; SSE41-NEXT: maxss %xmm0, %xmm2
+; SSE41-NEXT: cmpunordss %xmm0, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: maxss %xmm2, %xmm3
; SSE41-NEXT: cmpunordss %xmm2, %xmm2
-; SSE41-NEXT: andps %xmm2, %xmm0
-; SSE41-NEXT: andnps %xmm1, %xmm2
-; SSE41-NEXT: orps %xmm2, %xmm0
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v3f32:
@@ -166,31 +161,26 @@ define float @test_v4f32(<4 x float> %a0) {
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andps %xmm3, %xmm4
-; SSE41-NEXT: maxss %xmm1, %xmm3
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; SSE41-NEXT: andnps %xmm3, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm3
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3]
; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andnps %xmm3, %xmm4
-; SSE41-NEXT: andps %xmm2, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: maxss %xmm0, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: maxss %xmm0, %xmm1
; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: andnps %xmm2, %xmm3
-; SSE41-NEXT: andps %xmm1, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm4, %xmm2
+; SSE41-NEXT: maxss %xmm1, %xmm2
+; SSE41-NEXT: cmpunordss %xmm1, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: maxss %xmm2, %xmm1
+; SSE41-NEXT: cmpunordss %xmm2, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
@@ -266,35 +256,30 @@ define float @test_v8f32(<8 x float> %a0) {
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: maxps %xmm0, %xmm2
-; SSE41-NEXT: cmpunordps %xmm0, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: cmpunordss %xmm2, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: andps %xmm1, %xmm3
-; SSE41-NEXT: maxss %xmm2, %xmm1
-; SSE41-NEXT: andnps %xmm1, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm3
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andnps %xmm3, %xmm4
-; SSE41-NEXT: andps %xmm1, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSE41-NEXT: maxps %xmm0, %xmm3
+; SSE41-NEXT: cmpunordps %xmm0, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: maxss %xmm0, %xmm1
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: andnps %xmm1, %xmm3
-; SSE41-NEXT: andps %xmm2, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
+; SSE41-NEXT: maxss %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm3, %xmm4
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSE41-NEXT: movaps %xmm4, %xmm2
+; SSE41-NEXT: maxss %xmm1, %xmm2
+; SSE41-NEXT: cmpunordss %xmm1, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: maxss %xmm2, %xmm1
+; SSE41-NEXT: cmpunordss %xmm2, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
@@ -458,36 +443,31 @@ define float @test_v16f32(<16 x float> %a0) {
; SSE41-NEXT: cmpunordps %xmm1, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: maxps %xmm4, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: maxps %xmm4, %xmm3
; SSE41-NEXT: cmpunordps %xmm4, %xmm4
; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: maxss %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm3, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: cmpunordss %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: andps %xmm2, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm4
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSE41-NEXT: movaps %xmm4, %xmm2
; SSE41-NEXT: maxss %xmm1, %xmm2
-; SSE41-NEXT: andnps %xmm2, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: maxss %xmm0, %xmm3
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andnps %xmm3, %xmm4
-; SSE41-NEXT: andps %xmm2, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: maxss %xmm0, %xmm2
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: andnps %xmm2, %xmm3
-; SSE41-NEXT: andps %xmm1, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm1, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: maxss %xmm2, %xmm1
+; SSE41-NEXT: cmpunordss %xmm2, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
@@ -664,19 +644,30 @@ define float @test_v16f32(<16 x float> %a0) {
;
define double @test_v2f64(<2 x double> %a0) {
-; SSE-LABEL: test_v2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: cmpunordsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm3
-; SSE-NEXT: andpd %xmm2, %xmm3
-; SSE-NEXT: maxsd %xmm0, %xmm2
-; SSE-NEXT: andnpd %xmm2, %xmm1
-; SSE-NEXT: orpd %xmm3, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm3
+; SSE2-NEXT: andpd %xmm2, %xmm3
+; SSE2-NEXT: maxsd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movapd %xmm0, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: maxsd %xmm0, %xmm1
+; SSE41-NEXT: cmpunordsd %xmm0, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
@@ -724,15 +715,14 @@ define double @test_v4f64(<4 x double> %a0) {
; SSE41-NEXT: maxpd %xmm0, %xmm2
; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm3
-; SSE41-NEXT: andpd %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm2, %xmm3
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: maxsd %xmm2, %xmm1
-; SSE41-NEXT: andnpd %xmm1, %xmm0
-; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm2, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f64:
@@ -820,15 +810,14 @@ define double @test_v8f64(<8 x double> %a0) {
; SSE41-NEXT: cmpunordpd %xmm4, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm3
-; SSE41-NEXT: andpd %xmm2, %xmm3
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE41-NEXT: movapd %xmm3, %xmm2
; SSE41-NEXT: maxsd %xmm1, %xmm2
-; SSE41-NEXT: andnpd %xmm2, %xmm0
-; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm1, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f64:
@@ -1012,15 +1001,14 @@ define double @test_v16f64(<16 x double> %a0) {
; SSE41-NEXT: cmpunordpd %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm3
-; SSE41-NEXT: andpd %xmm2, %xmm3
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE41-NEXT: movapd %xmm3, %xmm2
; SSE41-NEXT: maxsd %xmm1, %xmm2
-; SSE41-NEXT: andnpd %xmm2, %xmm0
-; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm1, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f64:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
index ec41657d2f248f1..008e3e4c217cb8a 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
@@ -49,22 +49,19 @@ define float @test_v2f32(<2 x float> %a0) {
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: js .LBB1_1
+; SSE41-NEXT: # %bb.2:
; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: js .LBB1_2
-; SSE41-NEXT: # %bb.1:
+; SSE41-NEXT: jmp .LBB1_3
+; SSE41-NEXT: .LBB1_1:
; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: .LBB1_2:
-; SSE41-NEXT: movaps %xmm3, %xmm1
-; SSE41-NEXT: cmpunordss %xmm3, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm4
-; SSE41-NEXT: andps %xmm3, %xmm4
-; SSE41-NEXT: js .LBB1_4
-; SSE41-NEXT: # %bb.3:
; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: .LBB1_4:
-; SSE41-NEXT: maxss %xmm2, %xmm3
-; SSE41-NEXT: andnps %xmm3, %xmm1
-; SSE41-NEXT: orps %xmm4, %xmm1
+; SSE41-NEXT: .LBB1_3:
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: maxss %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: cmpunordss %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -171,65 +168,57 @@ define float @test_v4f32(<4 x float> %a0) {
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm0, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: js .LBB2_2
-; SSE41-NEXT: # %bb.1:
+; SSE41-NEXT: js .LBB2_1
+; SSE41-NEXT: # %bb.2:
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: jmp .LBB2_3
+; SSE41-NEXT: .LBB2_1:
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: .LBB2_3:
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE41-NEXT: movaps %xmm3, %xmm4
-; SSE41-NEXT: .LBB2_2:
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: movaps %xmm4, %xmm2
-; SSE41-NEXT: cmpunordss %xmm4, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm5
-; SSE41-NEXT: andps %xmm4, %xmm5
-; SSE41-NEXT: js .LBB2_4
-; SSE41-NEXT: # %bb.3:
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: .LBB2_4:
-; SSE41-NEXT: maxss %xmm3, %xmm4
-; SSE41-NEXT: andnps %xmm4, %xmm2
-; SSE41-NEXT: orps %xmm5, %xmm2
-; SSE41-NEXT: movd %xmm2, %eax
+; SSE41-NEXT: maxss %xmm0, %xmm4
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm4
+; SSE41-NEXT: movd %xmm4, %eax
; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: movaps %xmm2, %xmm4
-; SSE41-NEXT: js .LBB2_6
+; SSE41-NEXT: js .LBB2_4
; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: movaps %xmm1, %xmm4
+; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: jmp .LBB2_6
+; SSE41-NEXT: .LBB2_4:
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: movaps %xmm4, %xmm2
; SSE41-NEXT: .LBB2_6:
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE41-NEXT: movaps %xmm4, %xmm3
-; SSE41-NEXT: cmpunordss %xmm4, %xmm3
-; SSE41-NEXT: movaps %xmm3, %xmm5
-; SSE41-NEXT: andps %xmm4, %xmm5
-; SSE41-NEXT: js .LBB2_8
-; SSE41-NEXT: # %bb.7:
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: .LBB2_8:
-; SSE41-NEXT: maxss %xmm1, %xmm4
-; SSE41-NEXT: andnps %xmm4, %xmm3
-; SSE41-NEXT: orps %xmm5, %xmm3
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: maxss %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: cmpunordss %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3
; SSE41-NEXT: movd %xmm3, %eax
; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: movaps %xmm3, %xmm2
-; SSE41-NEXT: js .LBB2_10
-; SSE41-NEXT: # %bb.9:
-; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: .LBB2_10:
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: cmpunordss %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm4
-; SSE41-NEXT: andps %xmm2, %xmm4
-; SSE41-NEXT: js .LBB2_12
-; SSE41-NEXT: # %bb.11:
+; SSE41-NEXT: js .LBB2_7
+; SSE41-NEXT: # %bb.8:
; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: .LBB2_12:
+; SSE41-NEXT: jmp .LBB2_9
+; SSE41-NEXT: .LBB2_7:
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: .LBB2_9:
+; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: maxss %xmm0, %xmm2
-; SSE41-NEXT: andnps %xmm2, %xmm1
-; SSE41-NEXT: orps %xmm4, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: cmpunordss %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
@@ -410,61 +399,53 @@ define float @test_v8f32(<8 x float> %a0) {
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: js .LBB3_2
-; SSE41-NEXT: # %bb.1:
+; SSE41-NEXT: js .LBB3_1
+; SSE41-NEXT: # %bb.2:
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: jmp .LBB3_3
+; SSE41-NEXT: .LBB3_1:
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: .LBB3_3:
; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: .LBB3_2:
-; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: cmpunordss %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andps %xmm3, %xmm4
-; SSE41-NEXT: js .LBB3_4
-; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: maxss %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: cmpunordss %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movd %xmm3, %eax
; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: .LBB3_4:
-; SSE41-NEXT: maxss %xmm1, %xmm3
-; SSE41-NEXT: andnps %xmm3, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: js .LBB3_6
+; SSE41-NEXT: js .LBB3_4
; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: movaps %xmm3, %xmm4
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: jmp .LBB3_6
+; SSE41-NEXT: .LBB3_4:
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm3, %xmm1
; SSE41-NEXT: .LBB3_6:
-; SSE41-NEXT: movaps %xmm4, %xmm1
-; SSE41-NEXT: cmpunordss %xmm4, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm5
-; SSE41-NEXT: andps %xmm4, %xmm5
-; SSE41-NEXT: js .LBB3_8
-; SSE41-NEXT: # %bb.7:
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: .LBB3_8:
-; SSE41-NEXT: maxss %xmm3, %xmm4
-; SSE41-NEXT: andnps %xmm4, %xmm1
-; SSE41-NEXT: orps %xmm5, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: movaps %xmm1, %xmm3
+; SSE41-NEXT: maxss %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: cmpunordss %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movd %xmm3, %eax
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: js .LBB3_10
-; SSE41-NEXT: # %bb.9:
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: .LBB3_10:
+; SSE41-NEXT: js .LBB3_7
+; SSE41-NEXT: # %bb.8:
; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: cmpunordss %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andps %xmm3, %xmm4
-; SSE41-NEXT: js .LBB3_12
-; SSE41-NEXT: # %bb.11:
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: .LBB3_12:
-; SSE41-NEXT: maxss %xmm2, %xmm3
-; SSE41-NEXT: andnps %xmm3, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
+; SSE41-NEXT: jmp .LBB3_9
+; SSE41-NEXT: .LBB3_7:
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: movaps %xmm3, %xmm2
+; SSE41-NEXT: .LBB3_9:
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: maxss %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: cmpunordss %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
@@ -747,73 +728,65 @@ define float @test_v16f32(<16 x float> %a0) {
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: cmpunordps %xmm2, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm2
+; SSE41-NEXT: movaps %xmm1, %xmm3
; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2
+; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3
; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movaps %xmm5, %xmm1
-; SSE41-NEXT: maxps %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm5, %xmm2
+; SSE41-NEXT: maxps %xmm3, %xmm2
; SSE41-NEXT: movaps %xmm5, %xmm0
; SSE41-NEXT: cmpunordps %xmm5, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE41-NEXT: movd %xmm2, %eax
; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: js .LBB4_1
+; SSE41-NEXT: # %bb.2:
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: jmp .LBB4_3
+; SSE41-NEXT: .LBB4_1:
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: .LBB4_3:
; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: js .LBB4_2
-; SSE41-NEXT: # %bb.1:
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: .LBB4_2:
-; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: cmpunordss %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andps %xmm3, %xmm4
-; SSE41-NEXT: js .LBB4_4
-; SSE41-NEXT: # %bb.3:
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: .LBB4_4:
-; SSE41-NEXT: maxss %xmm2, %xmm3
-; SSE41-NEXT: andnps %xmm3, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE41-NEXT: maxss %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: cmpunordss %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movd %xmm3, %eax
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: js .LBB4_6
+; SSE41-NEXT: js .LBB4_4
; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: movaps %xmm3, %xmm4
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: jmp .LBB4_6
+; SSE41-NEXT: .LBB4_4:
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm3, %xmm1
; SSE41-NEXT: .LBB4_6:
-; SSE41-NEXT: movaps %xmm4, %xmm2
-; SSE41-NEXT: cmpunordss %xmm4, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm5
-; SSE41-NEXT: andps %xmm4, %xmm5
-; SSE41-NEXT: js .LBB4_8
-; SSE41-NEXT: # %bb.7:
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: .LBB4_8:
-; SSE41-NEXT: maxss %xmm3, %xmm4
-; SSE41-NEXT: andnps %xmm4, %xmm2
-; SSE41-NEXT: orps %xmm5, %xmm2
-; SSE41-NEXT: movd %xmm2, %eax
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE41-NEXT: testl %eax, %eax
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: js .LBB4_10
-; SSE41-NEXT: # %bb.9:
; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: .LBB4_10:
+; SSE41-NEXT: maxss %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: cmpunordss %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movd %xmm3, %eax
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSE41-NEXT: testl %eax, %eax
+; SSE41-NEXT: js .LBB4_7
+; SSE41-NEXT: # %bb.8:
; SSE41-NEXT: movaps %xmm3, %xmm0
-; SSE41-NEXT: cmpunordss %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andps %xmm3, %xmm4
-; SSE41-NEXT: js .LBB4_12
-; SSE41-NEXT: # %bb.11:
+; SSE41-NEXT: jmp .LBB4_9
+; SSE41-NEXT: .LBB4_7:
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: movaps %xmm3, %xmm2
+; SSE41-NEXT: .LBB4_9:
; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: .LBB4_12:
-; SSE41-NEXT: maxss %xmm1, %xmm3
-; SSE41-NEXT: andnps %xmm3, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
+; SSE41-NEXT: maxss %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: cmpunordss %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
@@ -986,30 +959,52 @@ define float @test_v16f32(<16 x float> %a0) {
;
define double @test_v2f64(<2 x double> %a0) {
-; SSE-LABEL: test_v2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: js .LBB5_2
-; SSE-NEXT: # %bb.1:
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: .LBB5_2:
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: cmpunordsd %xmm3, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm4
-; SSE-NEXT: andpd %xmm3, %xmm4
-; SSE-NEXT: js .LBB5_4
-; SSE-NEXT: # %bb.3:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: .LBB5_4:
-; SSE-NEXT: maxsd %xmm2, %xmm3
-; SSE-NEXT: andnpd %xmm3, %xmm1
-; SSE-NEXT: orpd %xmm4, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: js .LBB5_2
+; SSE2-NEXT: # %bb.1:
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: .LBB5_2:
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: cmpunordsd %xmm3, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm4
+; SSE2-NEXT: andpd %xmm3, %xmm4
+; SSE2-NEXT: js .LBB5_4
+; SSE2-NEXT: # %bb.3:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: .LBB5_4:
+; SSE2-NEXT: maxsd %xmm2, %xmm3
+; SSE2-NEXT: andnpd %xmm3, %xmm1
+; SSE2-NEXT: orpd %xmm4, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: js .LBB5_1
+; SSE41-NEXT: # %bb.2:
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: jmp .LBB5_3
+; SSE41-NEXT: .LBB5_1:
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: .LBB5_3:
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: maxsd %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
@@ -1092,34 +1087,32 @@ define double @test_v4f64(<4 x double> %a0) {
;
; SSE41-LABEL: test_v4f64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movapd %xmm0, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm0, %xmm2
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm0, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: maxpd %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: maxpd %xmm2, %xmm3
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: cmpunordpd %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; SSE41-NEXT: movq %xmm2, %rax
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSE41-NEXT: movq %xmm3, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: movapd %xmm2, %xmm3
-; SSE41-NEXT: js .LBB6_2
-; SSE41-NEXT: # %bb.1:
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: .LBB6_2:
+; SSE41-NEXT: js .LBB6_1
+; SSE41-NEXT: # %bb.2:
; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm4
-; SSE41-NEXT: andpd %xmm3, %xmm4
-; SSE41-NEXT: js .LBB6_4
-; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: jmp .LBB6_3
+; SSE41-NEXT: .LBB6_1:
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: movapd %xmm3, %xmm2
+; SSE41-NEXT: .LBB6_3:
; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: .LBB6_4:
-; SSE41-NEXT: maxsd %xmm1, %xmm3
-; SSE41-NEXT: andnpd %xmm3, %xmm0
-; SSE41-NEXT: orpd %xmm4, %xmm0
+; SSE41-NEXT: maxsd %xmm0, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f64:
@@ -1310,22 +1303,20 @@ define double @test_v8f64(<8 x double> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: js .LBB7_2
-; SSE41-NEXT: # %bb.1:
-; SSE41-NEXT: movapd %xmm2, %xmm3
-; SSE41-NEXT: .LBB7_2:
-; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm4
-; SSE41-NEXT: andpd %xmm3, %xmm4
-; SSE41-NEXT: js .LBB7_4
-; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: js .LBB7_1
+; SSE41-NEXT: # %bb.2:
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: jmp .LBB7_3
+; SSE41-NEXT: .LBB7_1:
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: .LBB7_4:
-; SSE41-NEXT: maxsd %xmm2, %xmm3
-; SSE41-NEXT: andnpd %xmm3, %xmm0
-; SSE41-NEXT: orpd %xmm4, %xmm0
+; SSE41-NEXT: .LBB7_3:
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: maxsd %xmm0, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f64:
@@ -1646,22 +1637,20 @@ define double @test_v16f64(<16 x double> %a0) {
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: movq %xmm1, %rax
; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: js .LBB8_2
-; SSE41-NEXT: # %bb.1:
-; SSE41-NEXT: movapd %xmm2, %xmm3
-; SSE41-NEXT: .LBB8_2:
-; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm4
-; SSE41-NEXT: andpd %xmm3, %xmm4
-; SSE41-NEXT: js .LBB8_4
-; SSE41-NEXT: # %bb.3:
+; SSE41-NEXT: js .LBB8_1
+; SSE41-NEXT: # %bb.2:
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: jmp .LBB8_3
+; SSE41-NEXT: .LBB8_1:
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: .LBB8_4:
-; SSE41-NEXT: maxsd %xmm2, %xmm3
-; SSE41-NEXT: andnpd %xmm3, %xmm0
-; SSE41-NEXT: orpd %xmm4, %xmm0
+; SSE41-NEXT: .LBB8_3:
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: maxsd %xmm0, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f64:
@@ -1792,3 +1781,5 @@ declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmaximum.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmaximum.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmaximum.v16f64(<16 x double>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
index 5ae9e552d0dcda4..727af12217c6781 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512,AVX512BW
@@ -36,13 +36,10 @@ define float @test_v2f32(<2 x float> %a0) {
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: cmpunordss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: andps %xmm2, %xmm3
-; SSE41-NEXT: minss %xmm0, %xmm2
-; SSE41-NEXT: andnps %xmm2, %xmm1
-; SSE41-NEXT: orps %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: minss %xmm0, %xmm1
+; SSE41-NEXT: cmpunordss %xmm0, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -99,31 +96,26 @@ define float @test_v4f32(<4 x float> %a0) {
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andps %xmm3, %xmm4
-; SSE41-NEXT: minss %xmm1, %xmm3
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; SSE41-NEXT: andnps %xmm3, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: minss %xmm0, %xmm3
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3]
; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andnps %xmm3, %xmm4
-; SSE41-NEXT: andps %xmm2, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: minss %xmm0, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: minss %xmm0, %xmm1
; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: andnps %xmm2, %xmm3
-; SSE41-NEXT: andps %xmm1, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm4, %xmm2
+; SSE41-NEXT: minss %xmm1, %xmm2
+; SSE41-NEXT: cmpunordss %xmm1, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: minss %xmm2, %xmm1
+; SSE41-NEXT: cmpunordss %xmm2, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
@@ -199,35 +191,30 @@ define float @test_v8f32(<8 x float> %a0) {
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: minps %xmm0, %xmm2
-; SSE41-NEXT: cmpunordps %xmm0, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: cmpunordss %xmm2, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: andps %xmm1, %xmm3
-; SSE41-NEXT: minss %xmm2, %xmm1
-; SSE41-NEXT: andnps %xmm1, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: minss %xmm0, %xmm3
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andnps %xmm3, %xmm4
-; SSE41-NEXT: andps %xmm1, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSE41-NEXT: minps %xmm0, %xmm3
+; SSE41-NEXT: cmpunordps %xmm0, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: minss %xmm0, %xmm1
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: andnps %xmm1, %xmm3
-; SSE41-NEXT: andps %xmm2, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
+; SSE41-NEXT: minss %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm3, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm3, %xmm4
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSE41-NEXT: movaps %xmm4, %xmm2
+; SSE41-NEXT: minss %xmm1, %xmm2
+; SSE41-NEXT: cmpunordss %xmm1, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: minss %xmm2, %xmm1
+; SSE41-NEXT: cmpunordss %xmm2, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f32:
@@ -391,36 +378,31 @@ define float @test_v16f32(<16 x float> %a0) {
; SSE41-NEXT: cmpunordps %xmm1, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: minps %xmm4, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm3
+; SSE41-NEXT: minps %xmm4, %xmm3
; SSE41-NEXT: cmpunordps %xmm4, %xmm4
; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3
+; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: minss %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm3, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: cmpunordss %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: andps %xmm2, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm4
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSE41-NEXT: movaps %xmm4, %xmm2
; SSE41-NEXT: minss %xmm1, %xmm2
-; SSE41-NEXT: andnps %xmm2, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE41-NEXT: movaps %xmm2, %xmm3
-; SSE41-NEXT: minss %xmm0, %xmm3
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: andnps %xmm3, %xmm4
-; SSE41-NEXT: andps %xmm2, %xmm0
-; SSE41-NEXT: orps %xmm4, %xmm0
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: minss %xmm0, %xmm2
-; SSE41-NEXT: cmpunordss %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: andnps %xmm2, %xmm3
-; SSE41-NEXT: andps %xmm1, %xmm0
-; SSE41-NEXT: orps %xmm3, %xmm0
+; SSE41-NEXT: cmpunordss %xmm1, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: minss %xmm2, %xmm1
+; SSE41-NEXT: cmpunordss %xmm2, %xmm2
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f32:
@@ -597,19 +579,30 @@ define float @test_v16f32(<16 x float> %a0) {
;
define double @test_v2f64(<2 x double> %a0) {
-; SSE-LABEL: test_v2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: cmpunordsd %xmm0, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm3
-; SSE-NEXT: andpd %xmm2, %xmm3
-; SSE-NEXT: minsd %xmm0, %xmm2
-; SSE-NEXT: andnpd %xmm2, %xmm1
-; SSE-NEXT: orpd %xmm3, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_v2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: cmpunordsd %xmm0, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm3
+; SSE2-NEXT: andpd %xmm2, %xmm3
+; SSE2-NEXT: minsd %xmm0, %xmm2
+; SSE2-NEXT: andnpd %xmm2, %xmm1
+; SSE2-NEXT: orpd %xmm3, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movapd %xmm0, %xmm2
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: minsd %xmm0, %xmm1
+; SSE41-NEXT: cmpunordsd %xmm0, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f64:
; AVX: # %bb.0:
@@ -661,15 +654,14 @@ define double @test_v3f64(<3 x double> %a0) {
; SSE41-NEXT: minpd %xmm0, %xmm1
; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm3
-; SSE41-NEXT: andpd %xmm2, %xmm3
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE41-NEXT: movapd %xmm3, %xmm2
; SSE41-NEXT: minsd %xmm1, %xmm2
-; SSE41-NEXT: andnpd %xmm2, %xmm0
-; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm1, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v3f64:
@@ -727,15 +719,14 @@ define double @test_v4f64(<4 x double> %a0) {
; SSE41-NEXT: minpd %xmm0, %xmm2
; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm3
-; SSE41-NEXT: andpd %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm2, %xmm3
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE41-NEXT: movapd %xmm3, %xmm1
; SSE41-NEXT: minsd %xmm2, %xmm1
-; SSE41-NEXT: andnpd %xmm1, %xmm0
-; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm2, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f64:
@@ -823,15 +814,14 @@ define double @test_v8f64(<8 x double> %a0) {
; SSE41-NEXT: cmpunordpd %xmm4, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm3
-; SSE41-NEXT: andpd %xmm2, %xmm3
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE41-NEXT: movapd %xmm3, %xmm2
; SSE41-NEXT: minsd %xmm1, %xmm2
-; SSE41-NEXT: andnpd %xmm2, %xmm0
-; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm1, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8f64:
@@ -1015,15 +1005,14 @@ define double @test_v16f64(<16 x double> %a0) {
; SSE41-NEXT: cmpunordpd %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm3
-; SSE41-NEXT: andpd %xmm2, %xmm3
+; SSE41-NEXT: movapd %xmm1, %xmm3
+; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE41-NEXT: movapd %xmm3, %xmm2
; SSE41-NEXT: minsd %xmm1, %xmm2
-; SSE41-NEXT: andnpd %xmm2, %xmm0
-; SSE41-NEXT: orpd %xmm3, %xmm0
+; SSE41-NEXT: cmpunordsd %xmm1, %xmm1
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16f64:
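
For readers skimming the vector-reduce-fmin deltas above: each scalar reduction step in the checked SSE41 output now folds its NaN handling into a blend. A hedged C sketch of what one such step computes (the function name is invented, not from the test file):

  /* One scalar step of the fmin reduction: the SSE41 checks above now
     expect minss + cmpunordss + blendvps for this, replacing the old
     andps/andnps/orps mask sequence. */
  float fmin_step(float elt, float acc) {
    float m = (elt < acc) ? elt : acc; /* minss: yields acc if elt is NaN */
    return (acc != acc) ? elt : m;     /* cmpunordss on acc drives the blend */
  }
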
diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll
index 50040593848a2c1..b3bb01137c70db4 100644
--- a/llvm/test/CodeGen/X86/vselect-zero.ll
+++ b/llvm/test/CodeGen/X86/vselect-zero.ll
@@ -113,14 +113,22 @@ define float @fsel_zero_true_val(float %a, float %b, float %x) {
}
define double @fsel_nonzero_false_val(double %x, double %y, double %z) {
-; SSE-LABEL: fsel_nonzero_false_val:
-; SSE: # %bb.0:
-; SSE-NEXT: cmpeqsd %xmm1, %xmm0
-; SSE-NEXT: andpd %xmm0, %xmm2
-; SSE-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
-; SSE-NEXT: andnpd %xmm1, %xmm0
-; SSE-NEXT: orpd %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: fsel_nonzero_false_val:
+; SSE2: # %bb.0:
+; SSE2-NEXT: cmpeqsd %xmm1, %xmm0
+; SSE2-NEXT: andpd %xmm0, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
+; SSE2-NEXT: andnpd %xmm1, %xmm0
+; SSE2-NEXT: orpd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fsel_nonzero_false_val:
+; SSE42: # %bb.0:
+; SSE42-NEXT: cmpeqsd %xmm1, %xmm0
+; SSE42-NEXT: movapd {{.*#+}} xmm1 = [4.2E+1,4.2E+1]
+; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; SSE42-NEXT: movapd %xmm1, %xmm0
+; SSE42-NEXT: retq
;
; AVX-LABEL: fsel_nonzero_false_val:
; AVX: # %bb.0:
@@ -142,14 +150,21 @@ define double @fsel_nonzero_false_val(double %x, double %y, double %z) {
}
define double @fsel_nonzero_true_val(double %x, double %y, double %z) {
-; SSE-LABEL: fsel_nonzero_true_val:
-; SSE: # %bb.0:
-; SSE-NEXT: cmpeqsd %xmm1, %xmm0
-; SSE-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
-; SSE-NEXT: andpd %xmm0, %xmm1
-; SSE-NEXT: andnpd %xmm2, %xmm0
-; SSE-NEXT: orpd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: fsel_nonzero_true_val:
+; SSE2: # %bb.0:
+; SSE2-NEXT: cmpeqsd %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
+; SSE2-NEXT: andpd %xmm0, %xmm1
+; SSE2-NEXT: andnpd %xmm2, %xmm0
+; SSE2-NEXT: orpd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fsel_nonzero_true_val:
+; SSE42: # %bb.0:
+; SSE42-NEXT: cmpeqsd %xmm1, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE42-NEXT: movapd %xmm2, %xmm0
+; SSE42-NEXT: retq
;
; AVX-LABEL: fsel_nonzero_true_val:
; AVX: # %bb.0:
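
The vselect-zero.ll changes show the +0.0 guard staying out of the way: in fsel_nonzero_false_val neither select arm is a +0.0 constant, so the SSE42 path is free to blend. A hedged C equivalent of that test's pattern (illustrative, not the test source):

  /* Mirrors fsel_nonzero_false_val: neither arm is +0.0, so on SSE4.2
     the select becomes cmpeqsd + blendvpd rather than andpd/andnpd/orpd. */
  double fsel_nonzero(double x, double y, double z) {
    return (x == y) ? z : 42.0;
  }
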
>From f56c7688dbef5cd7e5e79b304d787f692ce6c3dc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 5 Feb 2025 22:33:29 +0000
Subject: [PATCH 2/2] [TEST] Drop SSE41 constraints
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +-
llvm/test/CodeGen/X86/sse-minmax.ll | 144 ++++++++++--------------
2 files changed, 64 insertions(+), 90 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bc55d772b86b8de..00021328ef69a56 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24633,18 +24633,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
// instead of 3 logic instructions for size savings and potentially speed.
// Unfortunately, there is no scalar form of VBLENDV.
-
+ //
// If either operand is a +0.0 constant, don't try this. We can expect to
// optimize away at least one of the logic instructions later in that
// case, so that sequence would be faster than a variable blend.
-
- // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
- // uses XMM0 as the selection register. That may need just as many
- // instructions as the AND/ANDN/OR sequence due to register moves, so
- // only attempt this if at least one of ops (+ condition) are one use.
if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
- !isNullFPConstant(Op2) &&
- (Subtarget.hasAVX() || Op1->hasOneUse() || Op2->hasOneUse())) {
+ !isNullFPConstant(Op2)) {
// Convert to vectors, do a VSELECT, and convert back to scalar.
// All of the conversions should be optimized away.
MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
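
To illustrate the relaxed guard on a plain SSE4.1 target, a hedged C sketch of a scalar FP select whose arms differ from the compare operands (function name invented; the described codegen is approximate):

  /* The arms a/b differ from the compare operands x/y, so no minsd/maxsd
     fold applies. With this patch, SSE4.1 lowers the select roughly to
     cmpltsd + blendvpd (plus moves for BLENDV's implicit xmm0 mask)
     instead of three logic instructions. */
  double fsel(double x, double y, double a, double b) {
    return (x < y) ? a : b;
  }
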
diff --git a/llvm/test/CodeGen/X86/sse-minmax.ll b/llvm/test/CodeGen/X86/sse-minmax.ll
index 1c14b7400a35859..7904b21a3b1faee 100644
--- a/llvm/test/CodeGen/X86/sse-minmax.ll
+++ b/llvm/test/CodeGen/X86/sse-minmax.ll
@@ -80,11 +80,11 @@ define double @olt_inverse(double %x, double %y) {
define double @oge(double %x, double %y) {
; STRICT-LABEL: oge:
; STRICT: # %bb.0:
-; STRICT-NEXT: movapd %xmm1, %xmm2
-; STRICT-NEXT: cmplesd %xmm0, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm0
-; STRICT-NEXT: andnpd %xmm1, %xmm2
-; STRICT-NEXT: orpd %xmm2, %xmm0
+; STRICT-NEXT: movapd %xmm0, %xmm2
+; STRICT-NEXT: movapd %xmm1, %xmm0
+; STRICT-NEXT: cmplesd %xmm2, %xmm0
+; STRICT-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: oge:
@@ -100,10 +100,9 @@ define double @ole(double %x, double %y) {
; STRICT-LABEL: ole:
; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm0, %xmm2
-; STRICT-NEXT: cmplesd %xmm1, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm0
-; STRICT-NEXT: andnpd %xmm1, %xmm2
-; STRICT-NEXT: orpd %xmm2, %xmm0
+; STRICT-NEXT: cmplesd %xmm1, %xmm0
+; STRICT-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ole:
@@ -118,11 +117,10 @@ define double @ole(double %x, double %y) {
define double @oge_inverse(double %x, double %y) {
; STRICT-LABEL: oge_inverse:
; STRICT: # %bb.0:
-; STRICT-NEXT: movapd %xmm1, %xmm2
-; STRICT-NEXT: cmplesd %xmm0, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm1
-; STRICT-NEXT: andnpd %xmm0, %xmm2
-; STRICT-NEXT: orpd %xmm1, %xmm2
+; STRICT-NEXT: movapd %xmm0, %xmm2
+; STRICT-NEXT: movapd %xmm1, %xmm0
+; STRICT-NEXT: cmplesd %xmm2, %xmm0
+; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; STRICT-NEXT: movapd %xmm2, %xmm0
; STRICT-NEXT: retq
;
@@ -145,10 +143,8 @@ define double @ole_inverse(double %x, double %y) {
; STRICT-LABEL: ole_inverse:
; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm0, %xmm2
-; STRICT-NEXT: cmplesd %xmm1, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm1
-; STRICT-NEXT: andnpd %xmm0, %xmm2
-; STRICT-NEXT: orpd %xmm1, %xmm2
+; STRICT-NEXT: cmplesd %xmm1, %xmm0
+; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; STRICT-NEXT: movapd %xmm2, %xmm0
; STRICT-NEXT: retq
;
@@ -333,10 +329,9 @@ define double @ugt(double %x, double %y) {
; STRICT-LABEL: ugt:
; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm0, %xmm2
-; STRICT-NEXT: cmpnlesd %xmm1, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm0
-; STRICT-NEXT: andnpd %xmm1, %xmm2
-; STRICT-NEXT: orpd %xmm2, %xmm0
+; STRICT-NEXT: cmpnlesd %xmm1, %xmm0
+; STRICT-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ugt:
@@ -351,11 +346,11 @@ define double @ugt(double %x, double %y) {
define double @ult(double %x, double %y) {
; STRICT-LABEL: ult:
; STRICT: # %bb.0:
-; STRICT-NEXT: movapd %xmm1, %xmm2
-; STRICT-NEXT: cmpnlesd %xmm0, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm0
-; STRICT-NEXT: andnpd %xmm1, %xmm2
-; STRICT-NEXT: orpd %xmm2, %xmm0
+; STRICT-NEXT: movapd %xmm0, %xmm2
+; STRICT-NEXT: movapd %xmm1, %xmm0
+; STRICT-NEXT: cmpnlesd %xmm2, %xmm0
+; STRICT-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ult:
@@ -371,10 +366,8 @@ define double @ugt_inverse(double %x, double %y) {
; STRICT-LABEL: ugt_inverse:
; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm0, %xmm2
-; STRICT-NEXT: cmpnlesd %xmm1, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm1
-; STRICT-NEXT: andnpd %xmm0, %xmm2
-; STRICT-NEXT: orpd %xmm1, %xmm2
+; STRICT-NEXT: cmpnlesd %xmm1, %xmm0
+; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; STRICT-NEXT: movapd %xmm2, %xmm0
; STRICT-NEXT: retq
;
@@ -396,11 +389,10 @@ define double @ugt_inverse(double %x, double %y) {
define double @ult_inverse(double %x, double %y) {
; STRICT-LABEL: ult_inverse:
; STRICT: # %bb.0:
-; STRICT-NEXT: movapd %xmm1, %xmm2
-; STRICT-NEXT: cmpnlesd %xmm0, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm1
-; STRICT-NEXT: andnpd %xmm0, %xmm2
-; STRICT-NEXT: orpd %xmm1, %xmm2
+; STRICT-NEXT: movapd %xmm0, %xmm2
+; STRICT-NEXT: movapd %xmm1, %xmm0
+; STRICT-NEXT: cmpnlesd %xmm2, %xmm0
+; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; STRICT-NEXT: movapd %xmm2, %xmm0
; STRICT-NEXT: retq
;
@@ -738,12 +730,12 @@ define double @olt_inverse_y(double %x) {
define double @oge_y(double %x) {
; STRICT-LABEL: oge_y:
; STRICT: # %bb.0:
-; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
-; STRICT-NEXT: movapd %xmm1, %xmm2
-; STRICT-NEXT: cmplesd %xmm0, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm0
-; STRICT-NEXT: andnpd %xmm1, %xmm2
-; STRICT-NEXT: orpd %xmm2, %xmm0
+; STRICT-NEXT: movapd %xmm0, %xmm1
+; STRICT-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; STRICT-NEXT: cmplesd %xmm1, %xmm0
+; STRICT-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; STRICT-NEXT: movapd %xmm2, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: oge_y:
@@ -758,12 +750,11 @@ define double @oge_y(double %x) {
define double @ole_y(double %x) {
; STRICT-LABEL: ole_y:
; STRICT: # %bb.0:
-; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
-; STRICT-NEXT: movapd %xmm0, %xmm2
-; STRICT-NEXT: cmplesd %xmm1, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm0
-; STRICT-NEXT: andnpd %xmm1, %xmm2
-; STRICT-NEXT: orpd %xmm2, %xmm0
+; STRICT-NEXT: movapd %xmm0, %xmm1
+; STRICT-NEXT: cmplesd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; STRICT-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; STRICT-NEXT: movapd %xmm2, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ole_y:
@@ -778,12 +769,10 @@ define double @ole_y(double %x) {
define double @oge_inverse_y(double %x) {
; STRICT-LABEL: oge_inverse_y:
; STRICT: # %bb.0:
-; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0]
-; STRICT-NEXT: movapd %xmm2, %xmm1
-; STRICT-NEXT: cmplesd %xmm0, %xmm1
-; STRICT-NEXT: andpd %xmm1, %xmm2
-; STRICT-NEXT: andnpd %xmm0, %xmm1
-; STRICT-NEXT: orpd %xmm2, %xmm1
+; STRICT-NEXT: movapd %xmm0, %xmm1
+; STRICT-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; STRICT-NEXT: cmplesd %xmm1, %xmm0
+; STRICT-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
@@ -806,12 +795,9 @@ define double @oge_inverse_y(double %x) {
define double @ole_inverse_y(double %x) {
; STRICT-LABEL: ole_inverse_y:
; STRICT: # %bb.0:
-; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0]
; STRICT-NEXT: movapd %xmm0, %xmm1
-; STRICT-NEXT: cmplesd %xmm2, %xmm1
-; STRICT-NEXT: andpd %xmm1, %xmm2
-; STRICT-NEXT: andnpd %xmm0, %xmm1
-; STRICT-NEXT: orpd %xmm2, %xmm1
+; STRICT-NEXT: cmplesd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; STRICT-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
@@ -834,12 +820,11 @@ define double @ole_inverse_y(double %x) {
define double @ugt_y(double %x) {
; STRICT-LABEL: ugt_y:
; STRICT: # %bb.0:
-; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
-; STRICT-NEXT: movapd %xmm0, %xmm2
-; STRICT-NEXT: cmpnlesd %xmm1, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm0
-; STRICT-NEXT: andnpd %xmm1, %xmm2
-; STRICT-NEXT: orpd %xmm2, %xmm0
+; STRICT-NEXT: movapd %xmm0, %xmm1
+; STRICT-NEXT: cmpnlesd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; STRICT-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; STRICT-NEXT: movapd %xmm2, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ugt_y:
@@ -854,12 +839,12 @@ define double @ugt_y(double %x) {
define double @ult_y(double %x) {
; STRICT-LABEL: ult_y:
; STRICT: # %bb.0:
-; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0]
-; STRICT-NEXT: movapd %xmm1, %xmm2
-; STRICT-NEXT: cmpnlesd %xmm0, %xmm2
-; STRICT-NEXT: andpd %xmm2, %xmm0
-; STRICT-NEXT: andnpd %xmm1, %xmm2
-; STRICT-NEXT: orpd %xmm2, %xmm0
+; STRICT-NEXT: movapd %xmm0, %xmm1
+; STRICT-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; STRICT-NEXT: cmpnlesd %xmm1, %xmm0
+; STRICT-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; STRICT-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; STRICT-NEXT: movapd %xmm2, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ult_y:
@@ -874,12 +859,9 @@ define double @ult_y(double %x) {
define double @ugt_inverse_y(double %x) {
; STRICT-LABEL: ugt_inverse_y:
; STRICT: # %bb.0:
-; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0]
; STRICT-NEXT: movapd %xmm0, %xmm1
-; STRICT-NEXT: cmpnlesd %xmm2, %xmm1
-; STRICT-NEXT: andpd %xmm1, %xmm2
-; STRICT-NEXT: andnpd %xmm0, %xmm1
-; STRICT-NEXT: orpd %xmm2, %xmm1
+; STRICT-NEXT: cmpnlesd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; STRICT-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
@@ -902,12 +884,10 @@ define double @ugt_inverse_y(double %x) {
define double @ult_inverse_y(double %x) {
; STRICT-LABEL: ult_inverse_y:
; STRICT: # %bb.0:
-; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0]
-; STRICT-NEXT: movapd %xmm2, %xmm1
-; STRICT-NEXT: cmpnlesd %xmm0, %xmm1
-; STRICT-NEXT: andpd %xmm1, %xmm2
-; STRICT-NEXT: andnpd %xmm0, %xmm1
-; STRICT-NEXT: orpd %xmm2, %xmm1
+; STRICT-NEXT: movapd %xmm0, %xmm1
+; STRICT-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
+; STRICT-NEXT: cmpnlesd %xmm1, %xmm0
+; STRICT-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
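
The sse-minmax.ll deltas all reduce to the same idiom; a hedged C restatement of the oge case above (illustrative only):

  /* select(x >= y, x, y) under strict FP: previously cmplesd followed by
     andpd/andnpd/orpd, now cmplesd + blendvpd plus a movapd or two for
     the implicit xmm0 mask operand, as the STRICT checks above show. */
  double oge_select(double x, double y) {
    return (x >= y) ? x : y;
  }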