[llvm] 05a4cf2 - [X86] Autogenerate complete checks. NFC

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 3 17:36:27 PST 2020


Author: Craig Topper
Date: 2020-01-03T17:18:18-08:00
New Revision: 05a4cf26365f10ae0cb2ad76f2babfb5ed929fdc

URL: https://github.com/llvm/llvm-project/commit/05a4cf26365f10ae0cb2ad76f2babfb5ed929fdc
DIFF: https://github.com/llvm/llvm-project/commit/05a4cf26365f10ae0cb2ad76f2babfb5ed929fdc.diff

LOG: [X86] Autogenerate complete checks. NFC

Added: 
    

Modified: 
    llvm/test/CodeGen/X86/vec-strict-128-cmp.ll
    llvm/test/CodeGen/X86/vec-strict-256-cmp.ll
    llvm/test/CodeGen/X86/vec-strict-512-cmp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/vec-strict-128-cmp.ll b/llvm/test/CodeGen/X86/vec-strict-128-cmp.ll
index 855b312f3d5b..3b56e0114915 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128-cmp.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128-cmp.ll
@@ -1,32 +1,69 @@
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=CHECK,SSE,SSE-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=CHECK,SSE,SSE-64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-64
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512-64
 
 define <4 x i32> @test_v4f32_oeq_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_oeq_q:
-; SSE:       # %bb.0:
-; SSE:         cmpeqps {{.*}}, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_oeq_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpeqps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_oeq_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpeqps 8(%ebp), %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_oeq_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpeqps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_oeq_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpeqps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_oeq_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeqps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_oeq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpeqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_oeq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -35,29 +72,123 @@ define <4 x i32> @test_v4f32_oeq_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ogt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ogt_q:
-; SSE:       # %bb.0:
-; SSE:         ucomiss %xmm4, %xmm5
-; SSE:         unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
-; SSE:         unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE:         ucomiss %xmm5, %xmm6
-; SSE:         ucomiss %xmm3, %xmm2
-; SSE:         ucomiss %xmm3, %xmm2
-;
-; AVX-LABEL: test_v4f32_ogt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmplt_oqps {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ogt_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm3, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3]
+; SSE-32-NEXT:    movaps %xmm2, %xmm5
+; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm2[2,3]
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movaps %xmm3, %xmm5
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-32-NEXT:    movaps %xmm2, %xmm6
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-32-NEXT:    cmoval %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ogt_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm3, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3]
+; SSE-64-NEXT:    movaps %xmm2, %xmm5
+; SSE-64-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm2[2,3]
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-64-NEXT:    movl $-1, %ecx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmoval %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    movaps %xmm3, %xmm5
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-64-NEXT:    movaps %xmm2, %xmm6
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-64-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmoval %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm5
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmoval %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-64-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-64-NEXT:    cmoval %ecx, %eax
+; SSE-64-NEXT:    movd %eax, %xmm2
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ogt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmplt_oqps %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ogt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplt_oqps %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ogt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgt_oqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpgt_oqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ogt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqps %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -66,29 +197,123 @@ define <4 x i32> @test_v4f32_ogt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_oge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_oge_q:
-; SSE:       # %bb.0:
-; SSE:         ucomiss %xmm4, %xmm5
-; SSE:         unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
-; SSE:         unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE:         ucomiss %xmm5, %xmm6
-; SSE:         ucomiss %xmm3, %xmm2
-; SSE:         ucomiss %xmm3, %xmm2
-;
-; AVX-LABEL: test_v4f32_oge_q:
-; AVX:       # %bb.0:
-; AVX:         vcmple_oqps {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_oge_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm3, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3]
+; SSE-32-NEXT:    movaps %xmm2, %xmm5
+; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm2[2,3]
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movaps %xmm3, %xmm5
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-32-NEXT:    movaps %xmm2, %xmm6
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-32-NEXT:    cmovael %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_oge_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm3, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3]
+; SSE-64-NEXT:    movaps %xmm2, %xmm5
+; SSE-64-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm2[2,3]
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-64-NEXT:    movl $-1, %ecx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovael %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    movaps %xmm3, %xmm5
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-64-NEXT:    movaps %xmm2, %xmm6
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-64-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovael %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm5
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovael %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-64-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-64-NEXT:    cmovael %ecx, %eax
+; SSE-64-NEXT:    movd %eax, %xmm2
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_oge_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmple_oqps %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_oge_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmple_oqps %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_oge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpge_oqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpge_oqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_oge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqps %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -97,29 +322,122 @@ define <4 x i32> @test_v4f32_oge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_olt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_olt_q:
-; SSE:       # %bb.0:
-; SSE:         ucomiss %xmm4, %xmm5
-; SSE:         unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
-; SSE:         unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE:         ucomiss %xmm5, %xmm6
-; SSE:         ucomiss %xmm2, %xmm3
-; SSE:         ucomiss %xmm2, %xmm3
-;
-; AVX-LABEL: test_v4f32_olt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmplt_oqps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_olt_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
+; SSE-32-NEXT:    movaps %xmm3, %xmm5
+; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[2,3]
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movaps %xmm2, %xmm5
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-32-NEXT:    movaps %xmm3, %xmm6
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-32-NEXT:    cmoval %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_olt_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
+; SSE-64-NEXT:    movaps %xmm3, %xmm5
+; SSE-64-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[2,3]
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-64-NEXT:    movl $-1, %ecx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmoval %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    movaps %xmm2, %xmm5
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-64-NEXT:    movaps %xmm3, %xmm6
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-64-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmoval %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm5
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmoval %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-64-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-64-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-64-NEXT:    cmoval %ecx, %eax
+; SSE-64-NEXT:    movd %eax, %xmm2
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_olt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmplt_oqps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_olt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplt_oqps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_olt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmplt_oqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmplt_oqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_olt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -128,29 +446,122 @@ define <4 x i32> @test_v4f32_olt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ole_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ole_q:
-; SSE:       # %bb.0:
-; SSE:         ucomiss %xmm4, %xmm5
-; SSE:         unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
-; SSE:         unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE:         ucomiss %xmm5, %xmm6
-; SSE:         ucomiss %xmm2, %xmm3
-; SSE:         ucomiss %xmm2, %xmm3
-;
-; AVX-LABEL: test_v4f32_ole_q:
-; AVX:       # %bb.0:
-; AVX:         vcmple_oqps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ole_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
+; SSE-32-NEXT:    movaps %xmm3, %xmm5
+; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[2,3]
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movaps %xmm2, %xmm5
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-32-NEXT:    movaps %xmm3, %xmm6
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-32-NEXT:    cmovael %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ole_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
+; SSE-64-NEXT:    movaps %xmm3, %xmm5
+; SSE-64-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[2,3]
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-64-NEXT:    movl $-1, %ecx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovael %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    movaps %xmm2, %xmm5
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-64-NEXT:    movaps %xmm3, %xmm6
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-64-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovael %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm5
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovael %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-64-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-64-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-64-NEXT:    cmovael %ecx, %eax
+; SSE-64-NEXT:    movd %eax, %xmm2
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ole_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmple_oqps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ole_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmple_oqps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ole_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmple_oqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmple_oqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ole_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -159,29 +570,70 @@ define <4 x i32> @test_v4f32_ole_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_one_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_one_q:
-; SSE:       # %bb.0:
-; SSE:         cmpneqps %xmm3, %xmm4
-; SSE-NEXT:    cmpordps %xmm3, %xmm2
-; SSE-NEXT:    andps %xmm4, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_one_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_oqps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_one_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    cmpneqps %xmm3, %xmm4
+; SSE-32-NEXT:    cmpordps %xmm3, %xmm2
+; SSE-32-NEXT:    andps %xmm4, %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_one_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    cmpneqps %xmm3, %xmm4
+; SSE-64-NEXT:    cmpordps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm4, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_one_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpneq_oqps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_one_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_oqps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_one_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_oqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpneq_oqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_one_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_oqps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -190,27 +642,63 @@ define <4 x i32> @test_v4f32_one_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ord_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ord_q:
-; SSE:       # %bb.0:
-; SSE:         cmpordps {{.*}}, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_ord_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpordps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ord_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpordps 8(%ebp), %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ord_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpordps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ord_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpordps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ord_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpordps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ord_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpordps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpordps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ord_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpordps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -219,29 +707,70 @@ define <4 x i32> @test_v4f32_ord_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ueq_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ueq_q:
-; SSE:       # %bb.0:
-; SSE:         cmpeqps %xmm3, %xmm4
-; SSE-NEXT:    cmpunordps %xmm3, %xmm2
-; SSE-NEXT:    orps %xmm4, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_ueq_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_uqps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ueq_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    cmpeqps %xmm3, %xmm4
+; SSE-32-NEXT:    cmpunordps %xmm3, %xmm2
+; SSE-32-NEXT:    orps %xmm4, %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ueq_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    cmpeqps %xmm3, %xmm4
+; SSE-64-NEXT:    cmpunordps %xmm3, %xmm2
+; SSE-64-NEXT:    orps %xmm4, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ueq_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpeq_uqps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ueq_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_uqps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ueq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_uqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpeq_uqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ueq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_uqps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -250,29 +779,122 @@ define <4 x i32> @test_v4f32_ueq_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ugt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ugt_q:
-; SSE:       # %bb.0:
-; SSE:         ucomiss %xmm4, %xmm5
-; SSE:         unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
-; SSE:         unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE:         ucomiss %xmm5, %xmm6
-; SSE:         ucomiss %xmm2, %xmm3
-; SSE:         ucomiss %xmm2, %xmm3
-;
-; AVX-LABEL: test_v4f32_ugt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnle_uqps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ugt_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
+; SSE-32-NEXT:    movaps %xmm3, %xmm5
+; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[2,3]
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movaps %xmm2, %xmm5
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-32-NEXT:    movaps %xmm3, %xmm6
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-32-NEXT:    cmovbl %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ugt_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
+; SSE-64-NEXT:    movaps %xmm3, %xmm5
+; SSE-64-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[2,3]
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-64-NEXT:    movl $-1, %ecx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbl %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    movaps %xmm2, %xmm5
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-64-NEXT:    movaps %xmm3, %xmm6
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-64-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbl %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm5
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbl %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-64-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-64-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-64-NEXT:    cmovbl %ecx, %eax
+; SSE-64-NEXT:    movd %eax, %xmm2
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ugt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpnle_uqps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ugt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnle_uqps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ugt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnle_uqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpnle_uqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ugt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -281,29 +903,122 @@ define <4 x i32> @test_v4f32_ugt_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_uge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_uge_q:
-; SSE:       # %bb.0:
-; SSE:         ucomiss %xmm4, %xmm5
-; SSE:         unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
-; SSE:         unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE:         ucomiss %xmm5, %xmm6
-; SSE:         ucomiss %xmm2, %xmm3
-; SSE:         ucomiss %xmm2, %xmm3
-;
-; AVX-LABEL: test_v4f32_uge_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlt_uqps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_uge_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
+; SSE-32-NEXT:    movaps %xmm3, %xmm5
+; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[2,3]
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movaps %xmm2, %xmm5
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-32-NEXT:    movaps %xmm3, %xmm6
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-32-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-32-NEXT:    cmovbel %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_uge_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3]
+; SSE-64-NEXT:    movaps %xmm3, %xmm5
+; SSE-64-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm3[2,3]
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-64-NEXT:    movl $-1, %ecx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbel %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    movaps %xmm2, %xmm5
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-64-NEXT:    movaps %xmm3, %xmm6
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-64-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbel %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm5
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbel %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-64-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-64-NEXT:    ucomiss %xmm2, %xmm3
+; SSE-64-NEXT:    cmovbel %ecx, %eax
+; SSE-64-NEXT:    movd %eax, %xmm2
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_uge_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpnlt_uqps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_uge_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlt_uqps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_uge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnlt_uqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpnlt_uqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_uge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -312,29 +1027,123 @@ define <4 x i32> @test_v4f32_uge_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ult_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ult_q:
-; SSE:       # %bb.0:
-; SSE:         ucomiss %xmm4, %xmm5
-; SSE:         unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
-; SSE:         unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE:         ucomiss %xmm5, %xmm6
-; SSE:         ucomiss %xmm3, %xmm2
-; SSE:         ucomiss %xmm3, %xmm2
-;
-; AVX-LABEL: test_v4f32_ult_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnle_uqps {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ult_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm3, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3]
+; SSE-32-NEXT:    movaps %xmm2, %xmm5
+; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm2[2,3]
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movaps %xmm3, %xmm5
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-32-NEXT:    movaps %xmm2, %xmm6
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-32-NEXT:    cmovbl %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ult_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm3, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3]
+; SSE-64-NEXT:    movaps %xmm2, %xmm5
+; SSE-64-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm2[2,3]
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-64-NEXT:    movl $-1, %ecx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbl %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    movaps %xmm3, %xmm5
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-64-NEXT:    movaps %xmm2, %xmm6
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-64-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbl %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm5
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbl %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-64-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-64-NEXT:    cmovbl %ecx, %eax
+; SSE-64-NEXT:    movd %eax, %xmm2
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ult_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpnle_uqps %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ult_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnle_uqps %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ult_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnge_uqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpnge_uqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ult_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqps %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -343,29 +1152,123 @@ define <4 x i32> @test_v4f32_ult_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ule_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ule_q:
-; SSE:       # %bb.0:
-; SSE:         ucomiss %xmm4, %xmm5
-; SSE:         unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
-; SSE:         unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE:         ucomiss %xmm5, %xmm6
-; SSE:         ucomiss %xmm3, %xmm2
-; SSE:         ucomiss %xmm3, %xmm2
-;
-; AVX-LABEL: test_v4f32_ule_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlt_uqps {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ule_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm3, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3]
+; SSE-32-NEXT:    movaps %xmm2, %xmm5
+; SSE-32-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm2[2,3]
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    movaps %xmm3, %xmm5
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-32-NEXT:    movaps %xmm2, %xmm6
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-32-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm5
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-32-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-32-NEXT:    cmovbel %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ule_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm3, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1],xmm3[2,3]
+; SSE-64-NEXT:    movaps %xmm2, %xmm5
+; SSE-64-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,1],xmm2[2,3]
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomiss %xmm4, %xmm5
+; SSE-64-NEXT:    movl $-1, %ecx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbel %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    movaps %xmm3, %xmm5
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-64-NEXT:    movaps %xmm2, %xmm6
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-64-NEXT:    ucomiss %xmm5, %xmm6
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbel %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm5
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbel %ecx, %edx
+; SSE-64-NEXT:    movd %edx, %xmm4
+; SSE-64-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-64-NEXT:    ucomiss %xmm3, %xmm2
+; SSE-64-NEXT:    cmovbel %ecx, %eax
+; SSE-64-NEXT:    movd %eax, %xmm2
+; SSE-64-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ule_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpnlt_uqps %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ule_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlt_uqps %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ule_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngt_uqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpngt_uqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ule_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqps %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -374,27 +1277,63 @@ define <4 x i32> @test_v4f32_ule_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_une_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_une_q:
-; SSE:       # %bb.0:
-; SSE:         cmpneqps {{.*}}, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_une_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpneqps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_une_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpneqps 8(%ebp), %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_une_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpneqps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_une_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpneqps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_une_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneqps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_une_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneqps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpneqps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_une_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneqps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -403,27 +1342,63 @@ define <4 x i32> @test_v4f32_une_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_uno_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_uno_q:
-; SSE:       # %bb.0:
-; SSE:         cmpunordps {{.*}}, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_uno_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpunordps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_uno_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpunordps 8(%ebp), %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_uno_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpunordps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_uno_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpunordps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_uno_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpunordps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_uno_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunordps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpunordps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_uno_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunordps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -432,27 +1407,63 @@ define <4 x i32> @test_v4f32_uno_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <2 x i64> @test_v2f64_oeq_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_oeq_q:
-; SSE:       # %bb.0:
-; SSE:         cmpeqpd {{.*}}, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_oeq_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpeqpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_oeq_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpeqpd 8(%ebp), %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_oeq_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpeqpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_oeq_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpeqpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_oeq_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeqpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_oeq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpeqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_oeq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -461,27 +1472,89 @@ define <2 x i64> @test_v2f64_oeq_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ogt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ogt_q:
-; SSE:       # %bb.0:
-; SSE:         ucomisd %xmm3, %xmm2
-; SSE:         unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE:         unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE:         ucomisd %xmm3, %xmm2
-;
-; AVX-LABEL: test_v2f64_ogt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmplt_oqpd {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ogt_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    cmoval %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ogt_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-64-NEXT:    movq $-1, %rcx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovaq %rcx, %rdx
+; SSE-64-NEXT:    movq %rdx, %xmm4
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-64-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-64-NEXT:    cmovaq %rcx, %rax
+; SSE-64-NEXT:    movq %rax, %xmm2
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ogt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmplt_oqpd %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ogt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplt_oqpd %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ogt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgt_oqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpgt_oqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ogt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqpd %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -490,27 +1563,89 @@ define <2 x i64> @test_v2f64_ogt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_oge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_oge_q:
-; SSE:       # %bb.0:
-; SSE:         ucomisd %xmm3, %xmm2
-; SSE:         unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE:         unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE:         ucomisd %xmm3, %xmm2
-;
-; AVX-LABEL: test_v2f64_oge_q:
-; AVX:       # %bb.0:
-; AVX:         vcmple_oqpd {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_oge_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    cmovael %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_oge_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-64-NEXT:    movq $-1, %rcx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovaeq %rcx, %rdx
+; SSE-64-NEXT:    movq %rdx, %xmm4
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-64-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-64-NEXT:    cmovaeq %rcx, %rax
+; SSE-64-NEXT:    movq %rax, %xmm2
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_oge_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmple_oqpd %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_oge_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmple_oqpd %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_oge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpge_oqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpge_oqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_oge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqpd %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -519,27 +1654,88 @@ define <2 x i64> @test_v2f64_oge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_olt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_olt_q:
-; SSE:       # %bb.0:
-; SSE:         ucomisd %xmm2, %xmm3
-; SSE:         unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE:         unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE:         ucomisd %xmm2, %xmm3
-;
-; AVX-LABEL: test_v2f64_olt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmplt_oqpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_olt_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmoval %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    cmoval %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_olt_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-64-NEXT:    movq $-1, %rcx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovaq %rcx, %rdx
+; SSE-64-NEXT:    movq %rdx, %xmm4
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-64-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-64-NEXT:    cmovaq %rcx, %rax
+; SSE-64-NEXT:    movq %rax, %xmm2
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_olt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmplt_oqpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_olt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplt_oqpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_olt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmplt_oqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmplt_oqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_olt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -548,27 +1744,88 @@ define <2 x i64> @test_v2f64_olt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ole_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ole_q:
-; SSE:       # %bb.0:
-; SSE:         ucomisd %xmm2, %xmm3
-; SSE:         unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE:         unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE:         ucomisd %xmm2, %xmm3
-;
-; AVX-LABEL: test_v2f64_ole_q:
-; AVX:       # %bb.0:
-; AVX:         vcmple_oqpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ole_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovael %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    cmovael %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ole_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-64-NEXT:    movq $-1, %rcx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovaeq %rcx, %rdx
+; SSE-64-NEXT:    movq %rdx, %xmm4
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-64-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-64-NEXT:    cmovaeq %rcx, %rax
+; SSE-64-NEXT:    movq %rax, %xmm2
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ole_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmple_oqpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ole_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmple_oqpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ole_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmple_oqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmple_oqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ole_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -577,29 +1834,70 @@ define <2 x i64> @test_v2f64_ole_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_one_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_one_q:
-; SSE:       # %bb.0:
-; SSE:         cmpneqpd %xmm3, %xmm4
-; SSE-NEXT:    cmpordpd %xmm3, %xmm2
-; SSE-NEXT:    andpd %xmm4, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_one_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_oqpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_one_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd %xmm2, %xmm4
+; SSE-32-NEXT:    cmpneqpd %xmm3, %xmm4
+; SSE-32-NEXT:    cmpordpd %xmm3, %xmm2
+; SSE-32-NEXT:    andpd %xmm4, %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_one_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movapd %xmm2, %xmm4
+; SSE-64-NEXT:    cmpneqpd %xmm3, %xmm4
+; SSE-64-NEXT:    cmpordpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm4, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_one_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpneq_oqpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_one_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_oqpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_one_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_oqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpneq_oqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_one_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_oqpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -608,27 +1906,63 @@ define <2 x i64> @test_v2f64_one_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ord_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ord_q:
-; SSE:       # %bb.0:
-; SSE:         cmpordpd {{.*}}, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_ord_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpordpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ord_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpordpd 8(%ebp), %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ord_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpordpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ord_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpordpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ord_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpordpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ord_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpordpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpordpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ord_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpordpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -637,29 +1971,70 @@ define <2 x i64> @test_v2f64_ord_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ueq_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ueq_q:
-; SSE:       # %bb.0:
-; SSE:         cmpeqpd %xmm3, %xmm4
-; SSE-NEXT:    cmpunordpd %xmm3, %xmm2
-; SSE-NEXT:    orpd %xmm4, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_ueq_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_uqpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ueq_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd %xmm2, %xmm4
+; SSE-32-NEXT:    cmpeqpd %xmm3, %xmm4
+; SSE-32-NEXT:    cmpunordpd %xmm3, %xmm2
+; SSE-32-NEXT:    orpd %xmm4, %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ueq_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movapd %xmm2, %xmm4
+; SSE-64-NEXT:    cmpeqpd %xmm3, %xmm4
+; SSE-64-NEXT:    cmpunordpd %xmm3, %xmm2
+; SSE-64-NEXT:    orpd %xmm4, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ueq_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpeq_uqpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ueq_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_uqpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ueq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_uqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpeq_uqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ueq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_uqpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -668,27 +2043,88 @@ define <2 x i64> @test_v2f64_ueq_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ugt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ugt_q:
-; SSE:       # %bb.0:
-; SSE:         ucomisd %xmm2, %xmm3
-; SSE:         unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE:         unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE:         ucomisd %xmm2, %xmm3
-;
-; AVX-LABEL: test_v2f64_ugt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnle_uqpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ugt_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    cmovbl %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ugt_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-64-NEXT:    movq $-1, %rcx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbq %rcx, %rdx
+; SSE-64-NEXT:    movq %rdx, %xmm4
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-64-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-64-NEXT:    cmovbq %rcx, %rax
+; SSE-64-NEXT:    movq %rax, %xmm2
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ugt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpnle_uqpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ugt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnle_uqpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ugt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnle_uqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpnle_uqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ugt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -697,27 +2133,88 @@ define <2 x i64> @test_v2f64_ugt_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_uge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_uge_q:
-; SSE:       # %bb.0:
-; SSE:         ucomisd %xmm2, %xmm3
-; SSE:         unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE:         unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE:         ucomisd %xmm2, %xmm3
-;
-; AVX-LABEL: test_v2f64_uge_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlt_uqpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_uge_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-32-NEXT:    cmovbel %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_uge_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-64-NEXT:    movq $-1, %rcx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbeq %rcx, %rdx
+; SSE-64-NEXT:    movq %rdx, %xmm4
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-64-NEXT:    ucomisd %xmm2, %xmm3
+; SSE-64-NEXT:    cmovbeq %rcx, %rax
+; SSE-64-NEXT:    movq %rax, %xmm2
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_uge_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpnlt_uqpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_uge_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlt_uqpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_uge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnlt_uqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpnlt_uqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_uge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -726,27 +2223,89 @@ define <2 x i64> @test_v2f64_uge_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ult_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ult_q:
-; SSE:       # %bb.0:
-; SSE:         ucomisd %xmm3, %xmm2
-; SSE:         unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE:         unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE:         ucomisd %xmm3, %xmm2
-;
-; AVX-LABEL: test_v2f64_ult_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnle_uqpd {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ult_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbl %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    cmovbl %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ult_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-64-NEXT:    movq $-1, %rcx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbq %rcx, %rdx
+; SSE-64-NEXT:    movq %rdx, %xmm4
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-64-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-64-NEXT:    cmovbq %rcx, %rax
+; SSE-64-NEXT:    movq %rax, %xmm2
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ult_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpnle_uqpd %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ult_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnle_uqpd %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ult_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnge_uqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpnge_uqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ult_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqpd %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -755,27 +2314,89 @@ define <2 x i64> @test_v2f64_ult_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ule_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ule_q:
-; SSE:       # %bb.0:
-; SSE:         ucomisd %xmm3, %xmm2
-; SSE:         unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE:         unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE:         ucomisd %xmm3, %xmm2
-;
-; AVX-LABEL: test_v2f64_ule_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlt_uqpd {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ule_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    xorl %eax, %eax
+; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    movl $-1, %ecx
+; SSE-32-NEXT:    movl $0, %edx
+; SSE-32-NEXT:    cmovbel %ecx, %edx
+; SSE-32-NEXT:    movd %edx, %xmm4
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-32-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-32-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-32-NEXT:    cmovbel %ecx, %eax
+; SSE-32-NEXT:    movd %eax, %xmm2
+; SSE-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-32-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-32-NEXT:    pand %xmm4, %xmm0
+; SSE-32-NEXT:    pandn %xmm1, %xmm4
+; SSE-32-NEXT:    por %xmm4, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ule_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    xorl %eax, %eax
+; SSE-64-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-64-NEXT:    movq $-1, %rcx
+; SSE-64-NEXT:    movl $0, %edx
+; SSE-64-NEXT:    cmovbeq %rcx, %rdx
+; SSE-64-NEXT:    movq %rdx, %xmm4
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-64-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-64-NEXT:    ucomisd %xmm3, %xmm2
+; SSE-64-NEXT:    cmovbeq %rcx, %rax
+; SSE-64-NEXT:    movq %rax, %xmm2
+; SSE-64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE-64-NEXT:    pand %xmm4, %xmm0
+; SSE-64-NEXT:    pandn %xmm1, %xmm4
+; SSE-64-NEXT:    por %xmm4, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ule_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpnlt_uqpd %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ule_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlt_uqpd %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ule_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngt_uqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpngt_uqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ule_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqpd %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -784,27 +2405,63 @@ define <2 x i64> @test_v2f64_ule_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_une_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_une_q:
-; SSE:       # %bb.0:
-; SSE:         cmpneqpd {{.*}}, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_une_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpneqpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_une_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpneqpd 8(%ebp), %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_une_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpneqpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_une_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpneqpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_une_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneqpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_une_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneqpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpneqpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_une_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneqpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -813,27 +2470,63 @@ define <2 x i64> @test_v2f64_une_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_uno_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_uno_q:
-; SSE:       # %bb.0:
-; SSE:         cmpunordpd {{.*}}, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_uno_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpunordpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_uno_q:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpunordpd 8(%ebp), %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_uno_q:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpunordpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_uno_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpunordpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_uno_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpunordpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_uno_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunordpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpunordpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_uno_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunordpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -842,28 +2535,68 @@ define <2 x i64> @test_v2f64_uno_q(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <4 x i32> @test_v4f32_oeq_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_oeq_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltps %xmm3, %xmm4
-; SSE-NEXT:    cmpeqps %xmm3, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_oeq_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_osps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_oeq_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-32-NEXT:    cmpeqps %xmm3, %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_oeq_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-64-NEXT:    cmpeqps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_oeq_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpeq_osps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_oeq_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_osps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_oeq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_osps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpeq_osps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_oeq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_osps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -872,27 +2605,65 @@ define <4 x i32> @test_v4f32_oeq_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ogt_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ogt_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltps {{.*}}, %xmm3
-; SSE-NEXT:    andps %xmm3, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm3
-; SSE-NEXT:    orps %xmm3, %xmm0
-;
-; AVX-LABEL: test_v4f32_ogt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpltps {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ogt_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    cmpltps %xmm2, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ogt_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpltps %xmm2, %xmm3
+; SSE-64-NEXT:    andps %xmm3, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm3
+; SSE-64-NEXT:    orps %xmm3, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ogt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpltps %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ogt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpltps %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ogt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgtps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpgtps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ogt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltps %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -901,27 +2672,65 @@ define <4 x i32> @test_v4f32_ogt_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_oge_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_oge_s:
-; SSE:       # %bb.0:
-; SSE:         cmpleps {{.*}}, %xmm3
-; SSE-NEXT:    andps %xmm3, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm3
-; SSE-NEXT:    orps %xmm3, %xmm0
-;
-; AVX-LABEL: test_v4f32_oge_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpleps {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_oge_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    cmpleps %xmm2, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_oge_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpleps %xmm2, %xmm3
+; SSE-64-NEXT:    andps %xmm3, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm3
+; SSE-64-NEXT:    orps %xmm3, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_oge_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpleps %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_oge_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpleps %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_oge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgeps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpgeps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_oge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpleps %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -930,27 +2739,63 @@ define <4 x i32> @test_v4f32_oge_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_olt_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_olt_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltps {{.*}}, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_olt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpltps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_olt_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpltps 8(%ebp), %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_olt_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpltps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_olt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpltps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_olt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpltps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_olt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpltps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpltps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_olt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -959,27 +2804,63 @@ define <4 x i32> @test_v4f32_olt_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ole_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ole_s:
-; SSE:       # %bb.0:
-; SSE:         cmpleps {{.*}}, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_ole_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpleps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ole_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpleps 8(%ebp), %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ole_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpleps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ole_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpleps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ole_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpleps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ole_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpleps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpleps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ole_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpleps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -988,30 +2869,74 @@ define <4 x i32> @test_v4f32_ole_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_one_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_one_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltps %xmm3, %xmm4
-; SSE:         cmpneqps %xmm3, %xmm4
-; SSE-NEXT:    cmpordps %xmm3, %xmm2
-; SSE-NEXT:    andps %xmm4, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_one_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_osps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_one_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    cmpneqps %xmm3, %xmm4
+; SSE-32-NEXT:    cmpordps %xmm3, %xmm2
+; SSE-32-NEXT:    andps %xmm4, %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_one_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    cmpneqps %xmm3, %xmm4
+; SSE-64-NEXT:    cmpordps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm4, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_one_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpneq_osps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_one_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_osps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_one_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_osps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpneq_osps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_one_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_osps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -1020,28 +2945,68 @@ define <4 x i32> @test_v4f32_one_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ord_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ord_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltps %xmm3, %xmm4
-; SSE-NEXT:    cmpordps %xmm3, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_ord_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpord_sps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ord_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-32-NEXT:    cmpordps %xmm3, %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ord_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-64-NEXT:    cmpordps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ord_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpord_sps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ord_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpord_sps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ord_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpord_sps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpord_sps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ord_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpord_sps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -1050,30 +3015,74 @@ define <4 x i32> @test_v4f32_ord_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ueq_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ueq_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltps %xmm3, %xmm4
-; SSE:         cmpeqps %xmm3, %xmm4
-; SSE-NEXT:    cmpunordps %xmm3, %xmm2
-; SSE-NEXT:    orps %xmm4, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_ueq_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_usps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ueq_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    cmpeqps %xmm3, %xmm4
+; SSE-32-NEXT:    cmpunordps %xmm3, %xmm2
+; SSE-32-NEXT:    orps %xmm4, %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ueq_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    cmpeqps %xmm3, %xmm4
+; SSE-64-NEXT:    cmpunordps %xmm3, %xmm2
+; SSE-64-NEXT:    orps %xmm4, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ueq_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpeq_usps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ueq_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_usps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ueq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_usps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpeq_usps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ueq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_usps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -1082,27 +3091,63 @@ define <4 x i32> @test_v4f32_ueq_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ugt_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ugt_s:
-; SSE:       # %bb.0:
-; SSE:         cmpnleps {{.*}}, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_ugt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnleps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ugt_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpnleps 8(%ebp), %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ugt_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpnleps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ugt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpnleps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ugt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnleps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ugt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnleps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpnleps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ugt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnleps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -1111,27 +3156,63 @@ define <4 x i32> @test_v4f32_ugt_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_uge_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_uge_s:
-; SSE:       # %bb.0:
-; SSE:         cmpnltps {{.*}}, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_uge_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnltps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_uge_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpnltps 8(%ebp), %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_uge_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpnltps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_uge_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpnltps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_uge_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnltps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_uge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnltps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpnltps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_uge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -1140,27 +3221,65 @@ define <4 x i32> @test_v4f32_uge_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ult_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ult_s:
-; SSE:       # %bb.0:
-; SSE:         cmpnleps {{.*}}, %xmm3
-; SSE-NEXT:    andps %xmm3, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm3
-; SSE-NEXT:    orps %xmm3, %xmm0
-;
-; AVX-LABEL: test_v4f32_ult_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnleps {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ult_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    cmpnleps %xmm2, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ult_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpnleps %xmm2, %xmm3
+; SSE-64-NEXT:    andps %xmm3, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm3
+; SSE-64-NEXT:    orps %xmm3, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ult_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpnleps %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ult_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnleps %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ult_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngeps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpngeps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ult_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnleps %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -1169,27 +3288,65 @@ define <4 x i32> @test_v4f32_ult_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_ule_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_ule_s:
-; SSE:       # %bb.0:
-; SSE:         cmpnltps {{.*}}, %xmm3
-; SSE-NEXT:    andps %xmm3, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm3
-; SSE-NEXT:    orps %xmm3, %xmm0
-;
-; AVX-LABEL: test_v4f32_ule_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnltps {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_ule_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    cmpnltps %xmm2, %xmm3
+; SSE-32-NEXT:    andps %xmm3, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm3
+; SSE-32-NEXT:    orps %xmm3, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_ule_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpnltps %xmm2, %xmm3
+; SSE-64-NEXT:    andps %xmm3, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm3
+; SSE-64-NEXT:    orps %xmm3, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_ule_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpnltps %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_ule_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnltps %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_ule_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngtps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpngtps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_ule_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltps %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -1198,28 +3355,68 @@ define <4 x i32> @test_v4f32_ule_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_une_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_une_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltps %xmm3, %xmm4
-; SSE-NEXT:    cmpneqps %xmm3, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_une_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_usps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_une_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-32-NEXT:    cmpneqps %xmm3, %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_une_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-64-NEXT:    cmpneqps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_une_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpneq_usps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_une_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_usps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_une_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_usps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpneq_usps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_une_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_usps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -1228,28 +3425,68 @@ define <4 x i32> @test_v4f32_une_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <4 x i32> @test_v4f32_uno_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 {
-; SSE-LABEL: test_v4f32_uno_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltps %xmm3, %xmm4
-; SSE-NEXT:    cmpunordps %xmm3, %xmm2
-; SSE-NEXT:    andps %xmm2, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm2
-; SSE-NEXT:    orps %xmm2, %xmm0
-;
-; AVX-LABEL: test_v4f32_uno_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpunord_sps {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v4f32_uno_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movaps 8(%ebp), %xmm3
+; SSE-32-NEXT:    movaps %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-32-NEXT:    cmpunordps %xmm3, %xmm2
+; SSE-32-NEXT:    andps %xmm2, %xmm0
+; SSE-32-NEXT:    andnps %xmm1, %xmm2
+; SSE-32-NEXT:    orps %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v4f32_uno_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movaps %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltps %xmm3, %xmm4
+; SSE-64-NEXT:    cmpunordps %xmm3, %xmm2
+; SSE-64-NEXT:    andps %xmm2, %xmm0
+; SSE-64-NEXT:    andnps %xmm1, %xmm2
+; SSE-64-NEXT:    orps %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v4f32_uno_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpunord_sps 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f32_uno_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpunord_sps %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f32_uno_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunord_sps 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpunord_sps 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f32_uno_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunord_sps %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(
                                                <4 x float> %f1, <4 x float> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -1258,28 +3495,68 @@ define <4 x i32> @test_v4f32_uno_s(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1,
 }
 
 define <2 x i64> @test_v2f64_oeq_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_oeq_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltpd %xmm3, %xmm4
-; SSE-NEXT:    cmpeqpd %xmm3, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_oeq_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_ospd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_oeq_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-32-NEXT:    cmpeqpd %xmm3, %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_oeq_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movapd %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-64-NEXT:    cmpeqpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_oeq_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpeq_ospd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_oeq_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_ospd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_oeq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_ospd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpeq_ospd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_oeq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_ospd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -1288,27 +3565,65 @@ define <2 x i64> @test_v2f64_oeq_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ogt_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ogt_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltpd {{.*}}, %xmm3
-; SSE-NEXT:    andpd %xmm3, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm3
-; SSE-NEXT:    orpd %xmm3, %xmm0
-;
-; AVX-LABEL: test_v2f64_ogt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpltpd {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ogt_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    cmpltpd %xmm2, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ogt_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpltpd %xmm2, %xmm3
+; SSE-64-NEXT:    andpd %xmm3, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm3
+; SSE-64-NEXT:    orpd %xmm3, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ogt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpltpd %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ogt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpltpd %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ogt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgtpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpgtpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ogt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltpd %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -1317,27 +3632,65 @@ define <2 x i64> @test_v2f64_ogt_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_oge_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_oge_s:
-; SSE:       # %bb.0:
-; SSE:         cmplepd {{.*}}, %xmm3
-; SSE-NEXT:    andpd %xmm3, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm3
-; SSE-NEXT:    orpd %xmm3, %xmm0
-;
-; AVX-LABEL: test_v2f64_oge_s:
-; AVX:       # %bb.0:
-; AVX:         vcmplepd {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_oge_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    cmplepd %xmm2, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_oge_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmplepd %xmm2, %xmm3
+; SSE-64-NEXT:    andpd %xmm3, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm3
+; SSE-64-NEXT:    orpd %xmm3, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_oge_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmplepd %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_oge_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplepd %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_oge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgepd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpgepd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_oge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplepd %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -1346,27 +3699,63 @@ define <2 x i64> @test_v2f64_oge_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_olt_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_olt_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltpd {{.*}}, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_olt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpltpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_olt_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpltpd 8(%ebp), %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_olt_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpltpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_olt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpltpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_olt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpltpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_olt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpltpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpltpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_olt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -1375,27 +3764,63 @@ define <2 x i64> @test_v2f64_olt_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ole_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ole_s:
-; SSE:       # %bb.0:
-; SSE:         cmplepd {{.*}}, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_ole_s:
-; AVX:       # %bb.0:
-; AVX:         vcmplepd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ole_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmplepd 8(%ebp), %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ole_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmplepd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ole_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmplepd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ole_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplepd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ole_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmplepd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmplepd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ole_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplepd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -1404,30 +3829,74 @@ define <2 x i64> @test_v2f64_ole_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_one_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_one_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltpd %xmm3, %xmm4
-; SSE:         cmpneqpd %xmm3, %xmm4
-; SSE-NEXT:    cmpordpd %xmm3, %xmm2
-; SSE-NEXT:    andpd %xmm4, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_one_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_ospd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_one_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-32-NEXT:    movapd %xmm2, %xmm4
+; SSE-32-NEXT:    cmpneqpd %xmm3, %xmm4
+; SSE-32-NEXT:    cmpordpd %xmm3, %xmm2
+; SSE-32-NEXT:    andpd %xmm4, %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_one_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movapd %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-64-NEXT:    movapd %xmm2, %xmm4
+; SSE-64-NEXT:    cmpneqpd %xmm3, %xmm4
+; SSE-64-NEXT:    cmpordpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm4, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_one_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpneq_ospd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_one_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_ospd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_one_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_ospd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpneq_ospd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_one_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_ospd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -1436,28 +3905,68 @@ define <2 x i64> @test_v2f64_one_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ord_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ord_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltpd %xmm3, %xmm4
-; SSE-NEXT:    cmpordpd %xmm3, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_ord_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpord_spd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ord_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-32-NEXT:    cmpordpd %xmm3, %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ord_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movapd %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-64-NEXT:    cmpordpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ord_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpord_spd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ord_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpord_spd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ord_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpord_spd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpord_spd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ord_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpord_spd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -1466,30 +3975,74 @@ define <2 x i64> @test_v2f64_ord_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ueq_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ueq_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltpd %xmm3, %xmm4
-; SSE:         cmpeqpd %xmm3, %xmm4
-; SSE-NEXT:    cmpunordpd %xmm3, %xmm2
-; SSE-NEXT:    orpd %xmm4, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_ueq_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_uspd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ueq_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-32-NEXT:    movapd %xmm2, %xmm4
+; SSE-32-NEXT:    cmpeqpd %xmm3, %xmm4
+; SSE-32-NEXT:    cmpunordpd %xmm3, %xmm2
+; SSE-32-NEXT:    orpd %xmm4, %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ueq_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movapd %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-64-NEXT:    movapd %xmm2, %xmm4
+; SSE-64-NEXT:    cmpeqpd %xmm3, %xmm4
+; SSE-64-NEXT:    cmpunordpd %xmm3, %xmm2
+; SSE-64-NEXT:    orpd %xmm4, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ueq_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpeq_uspd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ueq_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_uspd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ueq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_uspd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpeq_uspd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ueq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_uspd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -1498,27 +4051,63 @@ define <2 x i64> @test_v2f64_ueq_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ugt_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ugt_s:
-; SSE:       # %bb.0:
-; SSE:         cmpnlepd {{.*}}, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_ugt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlepd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ugt_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpnlepd 8(%ebp), %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ugt_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpnlepd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ugt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpnlepd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ugt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlepd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ugt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnlepd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpnlepd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ugt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlepd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -1527,27 +4116,63 @@ define <2 x i64> @test_v2f64_ugt_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_uge_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_uge_s:
-; SSE:       # %bb.0:
-; SSE:         cmpnltpd {{.*}}, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_uge_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnltpd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_uge_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    cmpnltpd 8(%ebp), %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_uge_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpnltpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_uge_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpnltpd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_uge_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnltpd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_uge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnltpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpnltpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_uge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltpd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -1556,27 +4181,65 @@ define <2 x i64> @test_v2f64_uge_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ult_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ult_s:
-; SSE:       # %bb.0:
-; SSE:         cmpnlepd {{.*}}, %xmm3
-; SSE-NEXT:    andpd %xmm3, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm3
-; SSE-NEXT:    orpd %xmm3, %xmm0
-;
-; AVX-LABEL: test_v2f64_ult_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlepd {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ult_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    cmpnlepd %xmm2, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ult_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpnlepd %xmm2, %xmm3
+; SSE-64-NEXT:    andpd %xmm3, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm3
+; SSE-64-NEXT:    orpd %xmm3, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ult_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpnlepd %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ult_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlepd %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ult_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngepd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpngepd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ult_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlepd %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -1585,27 +4248,65 @@ define <2 x i64> @test_v2f64_ult_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_ule_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_ule_s:
-; SSE:       # %bb.0:
-; SSE:         cmpnltpd {{.*}}, %xmm3
-; SSE-NEXT:    andpd %xmm3, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm3
-; SSE-NEXT:    orpd %xmm3, %xmm0
-;
-; AVX-LABEL: test_v2f64_ule_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnltpd {{.*}}, %xmm3, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_ule_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    cmpnltpd %xmm2, %xmm3
+; SSE-32-NEXT:    andpd %xmm3, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm3
+; SSE-32-NEXT:    orpd %xmm3, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_ule_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    cmpnltpd %xmm2, %xmm3
+; SSE-64-NEXT:    andpd %xmm3, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm3
+; SSE-64-NEXT:    orpd %xmm3, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_ule_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %xmm3
+; AVX-32-NEXT:    vcmpnltpd %xmm2, %xmm3, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_ule_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnltpd %xmm2, %xmm3, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_ule_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngtpd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpngtpd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_ule_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltpd %xmm2, %xmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -1614,28 +4315,68 @@ define <2 x i64> @test_v2f64_ule_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_une_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_une_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltpd %xmm3, %xmm4
-; SSE-NEXT:    cmpneqpd %xmm3, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_une_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_uspd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_une_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-32-NEXT:    cmpneqpd %xmm3, %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_une_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movapd %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-64-NEXT:    cmpneqpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_une_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpneq_uspd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_une_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_uspd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_une_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_uspd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpneq_uspd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_une_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_uspd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -1644,28 +4385,68 @@ define <2 x i64> @test_v2f64_une_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
 }
 
 define <2 x i64> @test_v2f64_uno_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1, <2 x double> %f2) #0 {
-; SSE-LABEL: test_v2f64_uno_s:
-; SSE:       # %bb.0:
-; SSE:         cmpltpd %xmm3, %xmm4
-; SSE-NEXT:    cmpunordpd %xmm3, %xmm2
-; SSE-NEXT:    andpd %xmm2, %xmm0
-; SSE-NEXT:    andnpd %xmm1, %xmm2
-; SSE-NEXT:    orpd %xmm2, %xmm0
-;
-; AVX-LABEL: test_v2f64_uno_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpunord_spd {{.*}}, %xmm2, %xmm2
-; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; SSE-32-LABEL: test_v2f64_uno_s:
+; SSE-32:       # %bb.0:
+; SSE-32-NEXT:    pushl %ebp
+; SSE-32-NEXT:    movl %esp, %ebp
+; SSE-32-NEXT:    andl $-16, %esp
+; SSE-32-NEXT:    subl $16, %esp
+; SSE-32-NEXT:    movapd 8(%ebp), %xmm3
+; SSE-32-NEXT:    movapd %xmm2, %xmm4
+; SSE-32-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-32-NEXT:    cmpunordpd %xmm3, %xmm2
+; SSE-32-NEXT:    andpd %xmm2, %xmm0
+; SSE-32-NEXT:    andnpd %xmm1, %xmm2
+; SSE-32-NEXT:    orpd %xmm2, %xmm0
+; SSE-32-NEXT:    movl %ebp, %esp
+; SSE-32-NEXT:    popl %ebp
+; SSE-32-NEXT:    retl
+;
+; SSE-64-LABEL: test_v2f64_uno_s:
+; SSE-64:       # %bb.0:
+; SSE-64-NEXT:    movapd %xmm2, %xmm4
+; SSE-64-NEXT:    cmpltpd %xmm3, %xmm4
+; SSE-64-NEXT:    cmpunordpd %xmm3, %xmm2
+; SSE-64-NEXT:    andpd %xmm2, %xmm0
+; SSE-64-NEXT:    andnpd %xmm1, %xmm2
+; SSE-64-NEXT:    orpd %xmm2, %xmm0
+; SSE-64-NEXT:    retq
+;
+; AVX-32-LABEL: test_v2f64_uno_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-16, %esp
+; AVX-32-NEXT:    subl $16, %esp
+; AVX-32-NEXT:    vcmpunord_spd 8(%ebp), %xmm2, %xmm2
+; AVX-32-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v2f64_uno_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpunord_spd %xmm3, %xmm2, %xmm2
+; AVX-64-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v2f64_uno_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunord_spd 8(%ebp), %xmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-16, %esp
+; AVX512-32-NEXT:    subl $16, %esp
+; AVX512-32-NEXT:    vcmpunord_spd 8(%ebp), %xmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v2f64_uno_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunord_spd %xmm3, %xmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(
                                                <2 x double> %f1, <2 x double> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -1673,7 +4454,7 @@ define <2 x i64> @test_v2f64_uno_s(<2 x i64> %a, <2 x i64> %b, <2 x double> %f1,
   ret <2 x i64> %res
 }
 
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }
 
 declare <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(<4 x float>, <4 x float>, metadata, metadata)
 declare <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double>, <2 x double>, metadata, metadata)

diff  --git a/llvm/test/CodeGen/X86/vec-strict-256-cmp.ll b/llvm/test/CodeGen/X86/vec-strict-256-cmp.ll
index 421f03c1466b..583c2b484cf3 100644
--- a/llvm/test/CodeGen/X86/vec-strict-256-cmp.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-256-cmp.ll
@@ -1,23 +1,45 @@
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-64
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512-64
 
 define <8 x i32> @test_v8f32_oeq_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_oeq_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpeqps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_oeq_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpeqps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_oeq_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeqps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_oeq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpeqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_oeq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -26,20 +48,42 @@ define <8 x i32> @test_v8f32_oeq_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ogt_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ogt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmplt_oqps {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ogt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmplt_oqps %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ogt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplt_oqps %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ogt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgt_oqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpgt_oqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ogt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqps %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -48,20 +92,42 @@ define <8 x i32> @test_v8f32_ogt_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_oge_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_oge_q:
-; AVX:       # %bb.0:
-; AVX:         vcmple_oqps {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_oge_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmple_oqps %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_oge_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmple_oqps %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_oge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpge_oqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpge_oqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_oge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqps %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -70,20 +136,41 @@ define <8 x i32> @test_v8f32_oge_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_olt_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_olt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmplt_oqps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_olt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmplt_oqps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_olt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplt_oqps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_olt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmplt_oqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmplt_oqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_olt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -92,20 +179,41 @@ define <8 x i32> @test_v8f32_olt_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ole_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ole_q:
-; AVX:       # %bb.0:
-; AVX:         vcmple_oqps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ole_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmple_oqps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ole_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmple_oqps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ole_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmple_oqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmple_oqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ole_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -114,20 +222,41 @@ define <8 x i32> @test_v8f32_ole_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_one_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_one_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_oqps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_one_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpneq_oqps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_one_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_oqps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_one_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_oqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpneq_oqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_one_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_oqps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -136,20 +265,41 @@ define <8 x i32> @test_v8f32_one_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ord_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ord_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpordps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ord_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpordps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ord_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpordps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ord_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpordps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpordps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ord_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpordps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -158,20 +308,41 @@ define <8 x i32> @test_v8f32_ord_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ueq_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ueq_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_uqps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ueq_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpeq_uqps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ueq_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_uqps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ueq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_uqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpeq_uqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ueq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_uqps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -180,20 +351,41 @@ define <8 x i32> @test_v8f32_ueq_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ugt_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ugt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnle_uqps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ugt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpnle_uqps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ugt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnle_uqps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ugt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnle_uqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpnle_uqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ugt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -202,20 +394,41 @@ define <8 x i32> @test_v8f32_ugt_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_uge_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_uge_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlt_uqps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_uge_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpnlt_uqps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_uge_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlt_uqps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_uge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnlt_uqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpnlt_uqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_uge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -224,20 +437,42 @@ define <8 x i32> @test_v8f32_uge_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ult_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ult_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnle_uqps {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ult_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpnle_uqps %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ult_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnle_uqps %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ult_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnge_uqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpnge_uqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ult_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqps %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -246,20 +481,42 @@ define <8 x i32> @test_v8f32_ult_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ule_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ule_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlt_uqps {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ule_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpnlt_uqps %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ule_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlt_uqps %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ule_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngt_uqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpngt_uqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ule_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqps %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -268,20 +525,41 @@ define <8 x i32> @test_v8f32_ule_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_une_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_une_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpneqps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_une_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpneqps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_une_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneqps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_une_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneqps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpneqps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_une_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneqps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -290,20 +568,41 @@ define <8 x i32> @test_v8f32_une_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_uno_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_uno_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpunordps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_uno_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpunordps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_uno_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpunordps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_uno_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunordps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpunordps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_uno_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunordps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -312,20 +611,41 @@ define <8 x i32> @test_v8f32_uno_q(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <4 x i64> @test_v4f64_oeq_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_oeq_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpeqpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_oeq_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpeqpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_oeq_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeqpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_oeq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpeqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_oeq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -334,20 +654,42 @@ define <4 x i64> @test_v4f64_oeq_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ogt_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ogt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmplt_oqpd {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ogt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmplt_oqpd %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ogt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplt_oqpd %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ogt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgt_oqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpgt_oqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ogt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqpd %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -356,20 +698,42 @@ define <4 x i64> @test_v4f64_ogt_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_oge_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_oge_q:
-; AVX:       # %bb.0:
-; AVX:         vcmple_oqpd {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_oge_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmple_oqpd %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_oge_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmple_oqpd %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_oge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpge_oqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpge_oqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_oge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqpd %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -378,20 +742,41 @@ define <4 x i64> @test_v4f64_oge_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_olt_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_olt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmplt_oqpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_olt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmplt_oqpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_olt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplt_oqpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_olt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmplt_oqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmplt_oqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_olt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -400,20 +785,41 @@ define <4 x i64> @test_v4f64_olt_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ole_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ole_q:
-; AVX:       # %bb.0:
-; AVX:         vcmple_oqpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ole_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmple_oqpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ole_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmple_oqpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ole_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmple_oqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmple_oqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ole_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -422,20 +828,41 @@ define <4 x i64> @test_v4f64_ole_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_one_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_one_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_oqpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_one_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpneq_oqpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_one_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_oqpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_one_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_oqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpneq_oqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_one_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_oqpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -444,20 +871,41 @@ define <4 x i64> @test_v4f64_one_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ord_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ord_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpordpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ord_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpordpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ord_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpordpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ord_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpordpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpordpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ord_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpordpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -466,20 +914,41 @@ define <4 x i64> @test_v4f64_ord_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ueq_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ueq_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_uqpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ueq_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpeq_uqpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ueq_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_uqpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ueq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_uqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpeq_uqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ueq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_uqpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -488,20 +957,41 @@ define <4 x i64> @test_v4f64_ueq_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ugt_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ugt_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnle_uqpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ugt_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpnle_uqpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ugt_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnle_uqpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ugt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnle_uqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpnle_uqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ugt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -510,20 +1000,41 @@ define <4 x i64> @test_v4f64_ugt_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_uge_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_uge_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlt_uqpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_uge_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpnlt_uqpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_uge_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlt_uqpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_uge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnlt_uqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpnlt_uqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_uge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -532,20 +1043,42 @@ define <4 x i64> @test_v4f64_uge_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ult_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ult_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnle_uqpd {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ult_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpnle_uqpd %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ult_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnle_uqpd %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ult_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnge_uqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpnge_uqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ult_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqpd %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -554,20 +1087,42 @@ define <4 x i64> @test_v4f64_ult_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ule_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ule_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlt_uqpd {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ule_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpnlt_uqpd %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ule_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlt_uqpd %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ule_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngt_uqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpngt_uqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ule_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqpd %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -576,20 +1131,41 @@ define <4 x i64> @test_v4f64_ule_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_une_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_une_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpneqpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_une_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpneqpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_une_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneqpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_une_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneqpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpneqpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_une_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneqpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -598,20 +1174,41 @@ define <4 x i64> @test_v4f64_une_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_uno_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_uno_q:
-; AVX:       # %bb.0:
-; AVX:         vcmpunordpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_uno_q:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpunordpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_uno_q:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpunordpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_uno_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunordpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpunordpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_uno_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunordpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -620,20 +1217,41 @@ define <4 x i64> @test_v4f64_uno_q(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <8 x i32> @test_v8f32_oeq_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_oeq_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_osps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_oeq_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpeq_osps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_oeq_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_osps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_oeq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_osps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpeq_osps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_oeq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_osps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -642,20 +1260,42 @@ define <8 x i32> @test_v8f32_oeq_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ogt_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ogt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpltps {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ogt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpltps %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ogt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpltps %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ogt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgtps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpgtps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ogt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltps %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -664,20 +1304,42 @@ define <8 x i32> @test_v8f32_ogt_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_oge_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_oge_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpleps {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_oge_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpleps %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_oge_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpleps %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_oge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgeps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpgeps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_oge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpleps %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -686,20 +1348,41 @@ define <8 x i32> @test_v8f32_oge_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_olt_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_olt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpltps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_olt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpltps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_olt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpltps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_olt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpltps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpltps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_olt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -708,20 +1391,41 @@ define <8 x i32> @test_v8f32_olt_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ole_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ole_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpleps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ole_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpleps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ole_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpleps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ole_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpleps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpleps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ole_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpleps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -730,20 +1434,41 @@ define <8 x i32> @test_v8f32_ole_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_one_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_one_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_osps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_one_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpneq_osps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_one_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_osps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_one_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_osps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpneq_osps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_one_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_osps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -752,20 +1477,41 @@ define <8 x i32> @test_v8f32_one_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ord_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ord_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpord_sps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ord_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpord_sps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ord_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpord_sps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ord_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpord_sps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpord_sps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ord_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpord_sps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -774,20 +1520,41 @@ define <8 x i32> @test_v8f32_ord_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ueq_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ueq_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_usps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ueq_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpeq_usps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ueq_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_usps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ueq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_usps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpeq_usps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ueq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_usps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -796,20 +1563,41 @@ define <8 x i32> @test_v8f32_ueq_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ugt_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ugt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnleps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ugt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpnleps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ugt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnleps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ugt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnleps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpnleps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ugt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnleps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -818,20 +1606,41 @@ define <8 x i32> @test_v8f32_ugt_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_uge_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_uge_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnltps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_uge_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpnltps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_uge_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnltps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_uge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnltps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpnltps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_uge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -840,20 +1649,42 @@ define <8 x i32> @test_v8f32_uge_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ult_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ult_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnleps {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ult_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpnleps %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ult_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnleps %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ult_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngeps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpngeps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ult_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnleps %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -862,20 +1693,42 @@ define <8 x i32> @test_v8f32_ult_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_ule_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_ule_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnltps {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_ule_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovaps 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpnltps %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_ule_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnltps %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_ule_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngtps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpngtps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_ule_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltps %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -884,20 +1737,41 @@ define <8 x i32> @test_v8f32_ule_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_une_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_une_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_usps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_une_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpneq_usps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_une_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_usps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_une_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_usps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpneq_usps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_une_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_usps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -906,20 +1780,41 @@ define <8 x i32> @test_v8f32_une_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <8 x i32> @test_v8f32_uno_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1, <8 x float> %f2) #0 {
-; AVX-LABEL: test_v8f32_uno_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpunord_sps {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v8f32_uno_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpunord_sps 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v8f32_uno_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpunord_sps %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v8f32_uno_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunord_sps 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpunord_sps 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f32_uno_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunord_sps %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(
                                                <8 x float> %f1, <8 x float> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -928,20 +1823,41 @@ define <8 x i32> @test_v8f32_uno_s(<8 x i32> %a, <8 x i32> %b, <8 x float> %f1,
 }
 
 define <4 x i64> @test_v4f64_oeq_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_oeq_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_ospd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_oeq_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpeq_ospd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_oeq_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_ospd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_oeq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_ospd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpeq_ospd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_oeq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_ospd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -950,20 +1866,42 @@ define <4 x i64> @test_v4f64_oeq_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ogt_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ogt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpltpd {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ogt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpltpd %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ogt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpltpd %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ogt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgtpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpgtpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ogt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltpd %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -972,20 +1910,42 @@ define <4 x i64> @test_v4f64_ogt_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_oge_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_oge_s:
-; AVX:       # %bb.0:
-; AVX:         vcmplepd {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_oge_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmplepd %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_oge_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplepd %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_oge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgepd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpgepd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_oge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplepd %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -994,20 +1954,41 @@ define <4 x i64> @test_v4f64_oge_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_olt_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_olt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpltpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_olt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpltpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_olt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpltpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_olt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpltpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpltpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_olt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -1016,20 +1997,41 @@ define <4 x i64> @test_v4f64_olt_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ole_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ole_s:
-; AVX:       # %bb.0:
-; AVX:         vcmplepd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ole_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmplepd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ole_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmplepd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ole_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmplepd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmplepd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ole_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplepd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -1038,20 +2040,41 @@ define <4 x i64> @test_v4f64_ole_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_one_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_one_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_ospd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_one_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpneq_ospd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_one_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_ospd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_one_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_ospd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpneq_ospd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_one_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_ospd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -1060,20 +2083,41 @@ define <4 x i64> @test_v4f64_one_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ord_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ord_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpord_spd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ord_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpord_spd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ord_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpord_spd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ord_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpord_spd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpord_spd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ord_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpord_spd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -1082,20 +2126,41 @@ define <4 x i64> @test_v4f64_ord_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ueq_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ueq_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpeq_uspd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ueq_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpeq_uspd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ueq_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpeq_uspd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ueq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_uspd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpeq_uspd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ueq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_uspd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -1104,20 +2169,41 @@ define <4 x i64> @test_v4f64_ueq_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ugt_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ugt_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlepd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ugt_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpnlepd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ugt_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlepd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ugt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnlepd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpnlepd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ugt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlepd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -1126,20 +2212,41 @@ define <4 x i64> @test_v4f64_ugt_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_uge_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_uge_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnltpd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_uge_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpnltpd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_uge_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnltpd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_uge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnltpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpnltpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_uge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltpd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -1148,20 +2255,42 @@ define <4 x i64> @test_v4f64_uge_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ult_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ult_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnlepd {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ult_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpnlepd %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ult_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnlepd %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ult_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngepd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpngepd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ult_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlepd %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -1170,20 +2299,42 @@ define <4 x i64> @test_v4f64_ult_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_ule_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_ule_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpnltpd {{.*}}, %ymm3, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_ule_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vmovapd 8(%ebp), %ymm3
+; AVX-32-NEXT:    vcmpnltpd %ymm2, %ymm3, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_ule_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpnltpd %ymm2, %ymm3, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_ule_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngtpd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpngtpd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_ule_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltpd %ymm2, %ymm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -1192,20 +2343,41 @@ define <4 x i64> @test_v4f64_ule_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_une_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_une_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpneq_uspd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_une_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpneq_uspd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_une_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpneq_uspd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_une_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_uspd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpneq_uspd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_une_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_uspd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -1214,20 +2386,41 @@ define <4 x i64> @test_v4f64_une_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
 }
 
 define <4 x i64> @test_v4f64_uno_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1, <4 x double> %f2) #0 {
-; AVX-LABEL: test_v4f64_uno_s:
-; AVX:       # %bb.0:
-; AVX:         vcmpunord_spd {{.*}}, %ymm2, %ymm2
-; AVX-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-LABEL: test_v4f64_uno_s:
+; AVX-32:       # %bb.0:
+; AVX-32-NEXT:    pushl %ebp
+; AVX-32-NEXT:    movl %esp, %ebp
+; AVX-32-NEXT:    andl $-32, %esp
+; AVX-32-NEXT:    subl $32, %esp
+; AVX-32-NEXT:    vcmpunord_spd 8(%ebp), %ymm2, %ymm2
+; AVX-32-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-32-NEXT:    movl %ebp, %esp
+; AVX-32-NEXT:    popl %ebp
+; AVX-32-NEXT:    retl
+;
+; AVX-64-LABEL: test_v4f64_uno_s:
+; AVX-64:       # %bb.0:
+; AVX-64-NEXT:    vcmpunord_spd %ymm3, %ymm2, %ymm2
+; AVX-64-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX-64-NEXT:    retq
 ;
 ; AVX512-32-LABEL: test_v4f64_uno_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunord_spd 8(%ebp), %ymm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-32, %esp
+; AVX512-32-NEXT:    subl $32, %esp
+; AVX512-32-NEXT:    vcmpunord_spd 8(%ebp), %ymm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v4f64_uno_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunord_spd %ymm3, %ymm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(
                                                <4 x double> %f1, <4 x double> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -1235,7 +2428,7 @@ define <4 x i64> @test_v4f64_uno_s(<4 x i64> %a, <4 x i64> %b, <4 x double> %f1,
   ret <4 x i64> %res
 }
 
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }
 
 declare <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(<8 x float>, <8 x float>, metadata, metadata)
 declare <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x double>, <4 x double>, metadata, metadata)

diff  --git a/llvm/test/CodeGen/X86/vec-strict-512-cmp.ll b/llvm/test/CodeGen/X86/vec-strict-512-cmp.ll
index ff39fa221867..25b6525737fb 100644
--- a/llvm/test/CodeGen/X86/vec-strict-512-cmp.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-512-cmp.ll
@@ -1,16 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,AVX512-32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=CHECK,AVX512-64
 
 define <16 x i32> @test_v16f32_oeq_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_oeq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpeqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_oeq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -21,13 +30,21 @@ define <16 x i32> @test_v16f32_oeq_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ogt_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ogt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgt_oqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpgt_oqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ogt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqps %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -38,13 +55,21 @@ define <16 x i32> @test_v16f32_ogt_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_oge_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_oge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpge_oqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpge_oqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_oge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqps %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -55,13 +80,21 @@ define <16 x i32> @test_v16f32_oge_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_olt_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_olt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmplt_oqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmplt_oqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_olt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -72,13 +105,21 @@ define <16 x i32> @test_v16f32_olt_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ole_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ole_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmple_oqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmple_oqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ole_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -89,13 +130,21 @@ define <16 x i32> @test_v16f32_ole_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_one_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_one_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_oqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpneq_oqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_one_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_oqps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -106,13 +155,21 @@ define <16 x i32> @test_v16f32_one_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ord_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ord_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpordps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpordps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ord_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpordps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -123,13 +180,21 @@ define <16 x i32> @test_v16f32_ord_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ueq_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ueq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_uqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpeq_uqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ueq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_uqps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -140,13 +205,21 @@ define <16 x i32> @test_v16f32_ueq_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ugt_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ugt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnle_uqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpnle_uqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ugt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -157,13 +230,21 @@ define <16 x i32> @test_v16f32_ugt_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_uge_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_uge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnlt_uqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpnlt_uqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_uge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -174,13 +255,21 @@ define <16 x i32> @test_v16f32_uge_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ult_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ult_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnge_uqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpnge_uqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ult_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqps %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -191,13 +280,21 @@ define <16 x i32> @test_v16f32_ult_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ule_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ule_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngt_uqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpngt_uqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ule_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqps %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -208,13 +305,21 @@ define <16 x i32> @test_v16f32_ule_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_une_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_une_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneqps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpneqps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_une_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneqps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -225,13 +330,21 @@ define <16 x i32> @test_v16f32_une_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_uno_q(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_uno_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunordps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpunordps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_uno_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunordps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -242,13 +355,21 @@ define <16 x i32> @test_v16f32_uno_q(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <8 x i64> @test_v8f64_oeq_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_oeq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpeqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_oeq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -259,13 +380,21 @@ define <8 x i64> @test_v8f64_oeq_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ogt_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ogt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgt_oqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpgt_oqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ogt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqpd %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -276,13 +405,21 @@ define <8 x i64> @test_v8f64_ogt_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_oge_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_oge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpge_oqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpge_oqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_oge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqpd %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -293,13 +430,21 @@ define <8 x i64> @test_v8f64_oge_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_olt_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_olt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmplt_oqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmplt_oqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_olt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplt_oqpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -310,13 +455,21 @@ define <8 x i64> @test_v8f64_olt_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ole_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ole_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmple_oqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmple_oqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ole_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmple_oqpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -327,13 +480,21 @@ define <8 x i64> @test_v8f64_ole_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_one_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_one_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_oqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpneq_oqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_one_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_oqpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -344,13 +505,21 @@ define <8 x i64> @test_v8f64_one_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ord_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ord_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpordpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpordpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ord_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpordpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -361,13 +530,21 @@ define <8 x i64> @test_v8f64_ord_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ueq_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ueq_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_uqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpeq_uqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ueq_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_uqpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -378,13 +555,21 @@ define <8 x i64> @test_v8f64_ueq_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ugt_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ugt_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnle_uqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpnle_uqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ugt_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -395,13 +580,21 @@ define <8 x i64> @test_v8f64_ugt_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_uge_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_uge_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnlt_uqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpnlt_uqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_uge_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -412,13 +605,21 @@ define <8 x i64> @test_v8f64_uge_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ult_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ult_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnge_uqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpnge_uqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ult_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnle_uqpd %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -429,13 +630,21 @@ define <8 x i64> @test_v8f64_ult_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ule_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ule_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngt_uqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpngt_uqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ule_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlt_uqpd %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -446,13 +655,21 @@ define <8 x i64> @test_v8f64_ule_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_une_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_une_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneqpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpneqpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_une_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneqpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -463,13 +680,21 @@ define <8 x i64> @test_v8f64_une_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_uno_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_uno_q:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunordpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpunordpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_uno_q:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunordpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -480,13 +705,21 @@ define <8 x i64> @test_v8f64_uno_q(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <16 x i32> @test_v16f32_oeq_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_oeq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_osps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpeq_osps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_oeq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_osps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -497,13 +730,21 @@ define <16 x i32> @test_v16f32_oeq_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ogt_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ogt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgtps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpgtps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ogt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltps %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -514,13 +755,21 @@ define <16 x i32> @test_v16f32_ogt_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_oge_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_oge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgeps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpgeps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_oge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpleps %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -531,13 +780,21 @@ define <16 x i32> @test_v16f32_oge_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_olt_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_olt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpltps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpltps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_olt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -548,13 +805,21 @@ define <16 x i32> @test_v16f32_olt_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ole_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ole_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpleps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpleps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ole_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpleps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -565,13 +830,21 @@ define <16 x i32> @test_v16f32_ole_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_one_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_one_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_osps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpneq_osps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_one_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_osps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -582,13 +855,21 @@ define <16 x i32> @test_v16f32_one_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ord_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ord_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpord_sps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpord_sps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ord_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpord_sps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -599,13 +880,21 @@ define <16 x i32> @test_v16f32_ord_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ueq_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ueq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_usps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpeq_usps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ueq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_usps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -616,13 +905,21 @@ define <16 x i32> @test_v16f32_ueq_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ugt_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ugt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnleps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpnleps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ugt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnleps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -633,13 +930,21 @@ define <16 x i32> @test_v16f32_ugt_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_uge_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_uge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnltps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpnltps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_uge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -650,13 +955,21 @@ define <16 x i32> @test_v16f32_uge_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ult_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ult_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngeps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpngeps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ult_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnleps %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -667,13 +980,21 @@ define <16 x i32> @test_v16f32_ult_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_ule_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_ule_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngtps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpngtps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_ule_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltps %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -684,13 +1005,21 @@ define <16 x i32> @test_v16f32_ule_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_une_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_une_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_usps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpneq_usps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_une_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_usps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -701,13 +1030,21 @@ define <16 x i32> @test_v16f32_une_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <16 x i32> @test_v16f32_uno_s(<16 x i32> %a, <16 x i32> %b, <16 x float> %f1, <16 x float> %f2) #0 {
 ; AVX512-32-LABEL: test_v16f32_uno_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunord_sps 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpunord_sps 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v16f32_uno_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunord_sps %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(
                                                <16 x float> %f1, <16 x float> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -718,13 +1055,21 @@ define <16 x i32> @test_v16f32_uno_s(<16 x i32> %a, <16 x i32> %b, <16 x float>
 define <8 x i64> @test_v8f64_oeq_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_oeq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_ospd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpeq_ospd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_oeq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_ospd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"oeq",
                                                metadata !"fpexcept.strict") #0
@@ -735,13 +1080,21 @@ define <8 x i64> @test_v8f64_oeq_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ogt_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ogt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgtpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpgtpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ogt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltpd %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ogt",
                                                metadata !"fpexcept.strict") #0
@@ -752,13 +1105,21 @@ define <8 x i64> @test_v8f64_ogt_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_oge_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_oge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpgepd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpgepd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_oge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplepd %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"oge",
                                                metadata !"fpexcept.strict") #0
@@ -769,13 +1130,21 @@ define <8 x i64> @test_v8f64_oge_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_olt_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_olt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpltpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpltpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_olt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpltpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"olt",
                                                metadata !"fpexcept.strict") #0
@@ -786,13 +1155,21 @@ define <8 x i64> @test_v8f64_olt_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ole_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ole_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmplepd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmplepd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ole_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmplepd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ole",
                                                metadata !"fpexcept.strict") #0
@@ -803,13 +1180,21 @@ define <8 x i64> @test_v8f64_ole_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_one_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_one_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_ospd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpneq_ospd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_one_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_ospd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"one",
                                                metadata !"fpexcept.strict") #0
@@ -820,13 +1205,21 @@ define <8 x i64> @test_v8f64_one_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ord_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ord_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpord_spd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpord_spd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ord_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpord_spd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ord",
                                                metadata !"fpexcept.strict") #0
@@ -837,13 +1230,21 @@ define <8 x i64> @test_v8f64_ord_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ueq_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ueq_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpeq_uspd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpeq_uspd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ueq_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpeq_uspd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ueq",
                                                metadata !"fpexcept.strict") #0
@@ -854,13 +1255,21 @@ define <8 x i64> @test_v8f64_ueq_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ugt_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ugt_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnlepd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpnlepd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ugt_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlepd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ugt",
                                                metadata !"fpexcept.strict") #0
@@ -871,13 +1280,21 @@ define <8 x i64> @test_v8f64_ugt_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_uge_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_uge_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpnltpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpnltpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_uge_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltpd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"uge",
                                                metadata !"fpexcept.strict") #0
@@ -888,13 +1305,21 @@ define <8 x i64> @test_v8f64_uge_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ult_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ult_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngepd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpngepd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ult_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnlepd %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ult",
                                                metadata !"fpexcept.strict") #0
@@ -905,13 +1330,21 @@ define <8 x i64> @test_v8f64_ult_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_ule_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_ule_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpngtpd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpngtpd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_ule_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpnltpd %zmm2, %zmm3, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"ule",
                                                metadata !"fpexcept.strict") #0
@@ -922,13 +1355,21 @@ define <8 x i64> @test_v8f64_ule_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_une_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_une_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpneq_uspd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpneq_uspd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_une_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpneq_uspd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"une",
                                                metadata !"fpexcept.strict") #0
@@ -939,13 +1380,21 @@ define <8 x i64> @test_v8f64_une_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
 define <8 x i64> @test_v8f64_uno_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1, <8 x double> %f2) #0 {
 ; AVX512-32-LABEL: test_v8f64_uno_s:
 ; AVX512-32:       # %bb.0:
-; AVX512-32:         vcmpunord_spd 8(%ebp), %zmm2, %k1
+; AVX512-32-NEXT:    pushl %ebp
+; AVX512-32-NEXT:    movl %esp, %ebp
+; AVX512-32-NEXT:    andl $-64, %esp
+; AVX512-32-NEXT:    subl $64, %esp
+; AVX512-32-NEXT:    vcmpunord_spd 8(%ebp), %zmm2, %k1
 ; AVX512-32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-32-NEXT:    movl %ebp, %esp
+; AVX512-32-NEXT:    popl %ebp
+; AVX512-32-NEXT:    retl
 ;
 ; AVX512-64-LABEL: test_v8f64_uno_s:
 ; AVX512-64:       # %bb.0:
 ; AVX512-64-NEXT:    vcmpunord_spd %zmm3, %zmm2, %k1
 ; AVX512-64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-64-NEXT:    retq
   %cond = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(
                                                <8 x double> %f1, <8 x double> %f2, metadata !"uno",
                                                metadata !"fpexcept.strict") #0
@@ -953,7 +1402,7 @@ define <8 x i64> @test_v8f64_uno_s(<8 x i64> %a, <8 x i64> %b, <8 x double> %f1,
   ret <8 x i64> %res
 }
 
-attributes #0 = { strictfp }
+attributes #0 = { strictfp nounwind }
 
 declare <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(<16 x float>, <16 x float>, metadata, metadata)
 declare <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(<8 x double>, <8 x double>, metadata, metadata)


        


More information about the llvm-commits mailing list