[llvm] 9182dc7 - [X86] Add llvm.roundeven test cases. Add f80 test cases for constrained intrinsics that lower to libcalls. NFC

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat Jul 25 13:31:11 PDT 2020


Author: Craig Topper
Date: 2020-07-25T13:29:47-07:00
New Revision: 9182dc78145b9f1505d7fcc34b818f6d8aabcfda

URL: https://github.com/llvm/llvm-project/commit/9182dc78145b9f1505d7fcc34b818f6d8aabcfda
DIFF: https://github.com/llvm/llvm-project/commit/9182dc78145b9f1505d7fcc34b818f6d8aabcfda.diff

LOG: [X86] Add llvm.roundeven test cases. Add f80 test cases for constrained intrinsics that lower to libcalls. NFC

Added: 
    llvm/test/CodeGen/X86/fp-roundeven.ll
    llvm/test/CodeGen/X86/fp80-strict-libcalls.ll

Modified: 
    llvm/test/CodeGen/X86/fp-cvt.ll
    llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
    llvm/test/CodeGen/X86/fp128-libcalls-strict.ll

Removed: 
    


################################################################################
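For context when skimming the archive: the sketch below is not part of the commit; it is a minimal example distilled from the fp-cvt.ll additions in the diff that follows, showing the pattern the new f80 tests exercise. Since x86 has no native roundeven instruction for x86_fp80, the intrinsic is lowered to a call to the roundevenl libcall. The function name roundeven_sketch is illustrative only.

; Minimal sketch (not from the commit): llvm.roundeven.f80 lowers to roundevenl.
define x86_fp80 @roundeven_sketch(x86_fp80 %a) nounwind {
  ; Expected codegen on both 32-bit and 64-bit x86 is a call to roundevenl.
  %r = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %a)
  ret x86_fp80 %r
}

declare x86_fp80 @llvm.roundeven.f80(x86_fp80)
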
diff --git a/llvm/test/CodeGen/X86/fp-cvt.ll b/llvm/test/CodeGen/X86/fp-cvt.ll
index 667c2d414ed1..cedbfd2e9bff 100644
--- a/llvm/test/CodeGen/X86/fp-cvt.ll
+++ b/llvm/test/CodeGen/X86/fp-cvt.ll
@@ -1090,3 +1090,55 @@ define x86_fp80 @rint_fp80_ld(x86_fp80 *%a0) nounwind {
 }
 
 declare x86_fp80 @llvm.rint.f80(x86_fp80 %p)
+
+;
+; roundeven
+;
+
+define x86_fp80 @roundeven_fp80(x86_fp80 %a0) nounwind {
+; X86-LABEL: roundeven_fp80:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    calll roundevenl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: roundeven_fp80:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    callq roundevenl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+  %1 = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %a0)
+  ret x86_fp80 %1
+}
+
+define x86_fp80 @roundeven_fp80_ld(x86_fp80 *%a0) nounwind {
+; X86-LABEL: roundeven_fp80_ld:
+; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    fldt (%eax)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    calll roundevenl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: roundeven_fp80_ld:
+; X64:       # %bb.0:
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt (%rdi)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    callq roundevenl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+  %1 = load x86_fp80, x86_fp80 *%a0
+  %2 = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %1)
+  ret x86_fp80 %2
+}
+
+declare x86_fp80 @llvm.roundeven.f80(x86_fp80 %p)

diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll
new file mode 100644
index 000000000000..a3eae0137f3e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp-roundeven.ll
@@ -0,0 +1,1044 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+
+define float @roundeven_f32(float %x) {
+; SSE2-LABEL: roundeven_f32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    jmp _roundevenf ## TAILCALL
+;
+; SSE41-LABEL: roundeven_f32:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    jmp _roundevenf ## TAILCALL
+;
+; AVX-LABEL: roundeven_f32:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    jmp _roundevenf ## TAILCALL
+  %a = call float @llvm.roundeven.f32(float %x)
+  ret float %a
+}
+
+define double @roundeven_f64(double %x) {
+; SSE2-LABEL: roundeven_f64:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    jmp _roundeven ## TAILCALL
+;
+; SSE41-LABEL: roundeven_f64:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    jmp _roundeven ## TAILCALL
+;
+; AVX-LABEL: roundeven_f64:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    jmp _roundeven ## TAILCALL
+  %a = call double @llvm.roundeven.f64(double %x)
+  ret double %a
+}
+
+define <4 x float> @roundeven_v4f32(<4 x float> %x) {
+; SSE2-LABEL: roundeven_v4f32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    subq $56, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 64
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    addq $56, %rsp
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: roundeven_v4f32:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    subq $40, %rsp
+; SSE41-NEXT:    .cfi_def_cfa_offset 48
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT:    ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    addq $40, %rsp
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: roundeven_v4f32:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[1,0]
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    retq
+  %a = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
+  ret <4 x float> %a
+}
+
+define <2 x double> @roundeven_v2f64(<2 x double> %x) {
+; SSE2-LABEL: roundeven_v2f64:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    subq $40, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 48
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    addq $40, %rsp
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: roundeven_v2f64:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    subq $40, %rsp
+; SSE41-NEXT:    .cfi_def_cfa_offset 48
+; SSE41-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    addq $40, %rsp
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: roundeven_v2f64:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    callq _roundeven
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[1,0]
+; AVX-NEXT:    callq _roundeven
+; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    retq
+  %a = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x)
+  ret <2 x double> %a
+}
+
+define <8 x float> @roundeven_v8f32(<8 x float> %x) {
+; SSE2-LABEL: roundeven_v8f32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    subq $72, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 80
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    addq $72, %rsp
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: roundeven_v8f32:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    subq $56, %rsp
+; SSE41-NEXT:    .cfi_def_cfa_offset 64
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movshdup (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT:    ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT:    ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    addq $56, %rsp
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: roundeven_v8f32:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    subq $88, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 96
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[1,0]
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[1,0]
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX-NEXT:    callq _roundevenf
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX-NEXT:    addq $88, %rsp
+; AVX-NEXT:    retq
+  %a = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %x)
+  ret <8 x float> %a
+}
+
+define <4 x double> @roundeven_v4f64(<4 x double> %x) {
+; SSE2-LABEL: roundeven_v4f64:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    subq $56, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 64
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    addq $56, %rsp
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: roundeven_v4f64:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    subq $56, %rsp
+; SSE41-NEXT:    .cfi_def_cfa_offset 64
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    addq $56, %rsp
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: roundeven_v4f64:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    subq $88, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 96
+; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    callq _roundeven
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT:    vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[1,0]
+; AVX-NEXT:    callq _roundeven
+; AVX-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    callq _roundeven
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT:    ## xmm0 = mem[1,0]
+; AVX-NEXT:    callq _roundeven
+; AVX-NEXT:    vmovapd (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX-NEXT:    addq $88, %rsp
+; AVX-NEXT:    retq
+  %a = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %x)
+  ret <4 x double> %a
+}
+
+define <16 x float> @roundeven_v16f32(<16 x float> %x) {
+; SSE2-LABEL: roundeven_v16f32:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    subq $104, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 112
+; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
+; SSE2-NEXT:    ## xmm3 = xmm3[0],mem[0]
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE2-NEXT:    addq $104, %rsp
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: roundeven_v16f32:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    subq $88, %rsp
+; SSE41-NEXT:    .cfi_def_cfa_offset 96
+; SSE41-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT:    ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movshdup (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT:    ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT:    ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT:    ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    callq _roundevenf
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[0]
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE41-NEXT:    addq $88, %rsp
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: roundeven_v16f32:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    subq $152, %rsp
+; AVX1-NEXT:    .cfi_def_cfa_offset 160
+; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,0]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,0]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,0]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,0]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX1-NEXT:    callq _roundevenf
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 ## 16-byte Folded Reload
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT:    addq $152, %rsp
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: roundeven_v16f32:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    subq $184, %rsp
+; AVX512-NEXT:    .cfi_def_cfa_offset 192
+; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,0]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,0]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,0]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,1,3,3]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,0]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[3,1,2,3]
+; AVX512-NEXT:    callq _roundevenf
+; AVX512-NEXT:    vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 ## 32-byte Folded Reload
+; AVX512-NEXT:    addq $184, %rsp
+; AVX512-NEXT:    retq
+  %a = call <16 x float> @llvm.roundeven.v16f32(<16 x float> %x)
+  ret <16 x float> %a
+}
+
+define <8 x double> @roundeven_v8f64(<8 x double> %x) {
+; SSE2-LABEL: roundeven_v8f64:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    subq $88, %rsp
+; SSE2-NEXT:    .cfi_def_cfa_offset 96
+; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    callq _roundeven
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE2-NEXT:    addq $88, %rsp
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: roundeven_v8f64:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    subq $88, %rsp
+; SSE41-NEXT:    .cfi_def_cfa_offset 96
+; SSE41-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT:    callq _roundeven
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; SSE41-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE41-NEXT:    addq $88, %rsp
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: roundeven_v8f64:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    subq $120, %rsp
+; AVX1-NEXT:    .cfi_def_cfa_offset 128
+; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq _roundeven
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,0]
+; AVX1-NEXT:    callq _roundeven
+; AVX1-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq _roundeven
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,0]
+; AVX1-NEXT:    callq _roundeven
+; AVX1-NEXT:    vmovapd (%rsp), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq _roundeven
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,0]
+; AVX1-NEXT:    callq _roundeven
+; AVX1-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    callq _roundeven
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT:    ## xmm0 = mem[1,0]
+; AVX1-NEXT:    callq _roundeven
+; AVX1-NEXT:    vmovapd (%rsp), %xmm1 ## 16-byte Reload
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 ## 16-byte Folded Reload
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT:    addq $120, %rsp
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: roundeven_v8f64:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    subq $184, %rsp
+; AVX512-NEXT:    .cfi_def_cfa_offset 192
+; AVX512-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq _roundeven
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,0]
+; AVX512-NEXT:    callq _roundeven
+; AVX512-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq _roundeven
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,0]
+; AVX512-NEXT:    callq _roundeven
+; AVX512-NEXT:    vmovapd (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq _roundeven
+; AVX512-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,0]
+; AVX512-NEXT:    callq _roundeven
+; AVX512-NEXT:    vmovapd (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vmovapd %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    callq _roundeven
+; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    ## xmm0 = mem[1,0]
+; AVX512-NEXT:    callq _roundeven
+; AVX512-NEXT:    vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX512-NEXT:    vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 ## 32-byte Folded Reload
+; AVX512-NEXT:    addq $184, %rsp
+; AVX512-NEXT:    retq
+  %a = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %x)
+  ret <8 x double> %a
+}
+
+declare float @llvm.roundeven.f32(float)
+declare double @llvm.roundeven.f64(double)
+declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
+declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
+declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
+declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
+declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
+declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)

diff  --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
index da05e8be432e..f5a6af9c4d65 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
@@ -16,6 +16,10 @@ declare float @llvm.experimental.constrained.rint.f32(float, metadata, metadata)
 declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata)
 declare float @llvm.experimental.constrained.nearbyint.f32(float, metadata, metadata)
 declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata)
+declare float @llvm.experimental.constrained.round.f32(float, metadata)
+declare double @llvm.experimental.constrained.round.f64(double, metadata)
+declare float @llvm.experimental.constrained.roundeven.f32(float, metadata)
+declare double @llvm.experimental.constrained.roundeven.f64(double, metadata)
 
 define float @fceil32(float %f) #0 {
 ; SSE41-X86-LABEL: fceil32:
@@ -491,4 +495,184 @@ define double @fnearbyintf64(double %f) #0 {
   ret double %res
 }
 
+define float @fround32(float %f) #0 {
+; SSE41-X86-LABEL: fround32:
+; SSE41-X86:       # %bb.0:
+; SSE41-X86-NEXT:    pushl %eax
+; SSE41-X86-NEXT:    .cfi_def_cfa_offset 8
+; SSE41-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-X86-NEXT:    movss %xmm0, (%esp)
+; SSE41-X86-NEXT:    calll roundf
+; SSE41-X86-NEXT:    popl %eax
+; SSE41-X86-NEXT:    .cfi_def_cfa_offset 4
+; SSE41-X86-NEXT:    retl
+;
+; SSE41-X64-LABEL: fround32:
+; SSE41-X64:       # %bb.0:
+; SSE41-X64-NEXT:    pushq %rax
+; SSE41-X64-NEXT:    .cfi_def_cfa_offset 16
+; SSE41-X64-NEXT:    callq roundf
+; SSE41-X64-NEXT:    popq %rax
+; SSE41-X64-NEXT:    .cfi_def_cfa_offset 8
+; SSE41-X64-NEXT:    retq
+;
+; AVX-X86-LABEL: fround32:
+; AVX-X86:       # %bb.0:
+; AVX-X86-NEXT:    pushl %eax
+; AVX-X86-NEXT:    .cfi_def_cfa_offset 8
+; AVX-X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
+; AVX-X86-NEXT:    calll roundf
+; AVX-X86-NEXT:    popl %eax
+; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
+; AVX-X86-NEXT:    retl
+;
+; AVX-X64-LABEL: fround32:
+; AVX-X64:       # %bb.0:
+; AVX-X64-NEXT:    pushq %rax
+; AVX-X64-NEXT:    .cfi_def_cfa_offset 16
+; AVX-X64-NEXT:    callq roundf
+; AVX-X64-NEXT:    popq %rax
+; AVX-X64-NEXT:    .cfi_def_cfa_offset 8
+; AVX-X64-NEXT:    retq
+  %res = call float @llvm.experimental.constrained.round.f32(
+                        float %f, metadata !"fpexcept.strict") #0
+  ret float %res
+}
+
+define double @froundf64(double %f) #0 {
+; SSE41-X86-LABEL: froundf64:
+; SSE41-X86:       # %bb.0:
+; SSE41-X86-NEXT:    subl $8, %esp
+; SSE41-X86-NEXT:    .cfi_def_cfa_offset 12
+; SSE41-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-X86-NEXT:    movsd %xmm0, (%esp)
+; SSE41-X86-NEXT:    calll round
+; SSE41-X86-NEXT:    addl $8, %esp
+; SSE41-X86-NEXT:    .cfi_def_cfa_offset 4
+; SSE41-X86-NEXT:    retl
+;
+; SSE41-X64-LABEL: froundf64:
+; SSE41-X64:       # %bb.0:
+; SSE41-X64-NEXT:    pushq %rax
+; SSE41-X64-NEXT:    .cfi_def_cfa_offset 16
+; SSE41-X64-NEXT:    callq round
+; SSE41-X64-NEXT:    popq %rax
+; SSE41-X64-NEXT:    .cfi_def_cfa_offset 8
+; SSE41-X64-NEXT:    retq
+;
+; AVX-X86-LABEL: froundf64:
+; AVX-X86:       # %bb.0:
+; AVX-X86-NEXT:    subl $8, %esp
+; AVX-X86-NEXT:    .cfi_def_cfa_offset 12
+; AVX-X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
+; AVX-X86-NEXT:    calll round
+; AVX-X86-NEXT:    addl $8, %esp
+; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
+; AVX-X86-NEXT:    retl
+;
+; AVX-X64-LABEL: froundf64:
+; AVX-X64:       # %bb.0:
+; AVX-X64-NEXT:    pushq %rax
+; AVX-X64-NEXT:    .cfi_def_cfa_offset 16
+; AVX-X64-NEXT:    callq round
+; AVX-X64-NEXT:    popq %rax
+; AVX-X64-NEXT:    .cfi_def_cfa_offset 8
+; AVX-X64-NEXT:    retq
+  %res = call double @llvm.experimental.constrained.round.f64(
+                        double %f, metadata !"fpexcept.strict") #0
+  ret double %res
+}
+
+define float @froundeven32(float %f) #0 {
+; SSE41-X86-LABEL: froundeven32:
+; SSE41-X86:       # %bb.0:
+; SSE41-X86-NEXT:    pushl %eax
+; SSE41-X86-NEXT:    .cfi_def_cfa_offset 8
+; SSE41-X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-X86-NEXT:    movss %xmm0, (%esp)
+; SSE41-X86-NEXT:    calll roundevenf
+; SSE41-X86-NEXT:    popl %eax
+; SSE41-X86-NEXT:    .cfi_def_cfa_offset 4
+; SSE41-X86-NEXT:    retl
+;
+; SSE41-X64-LABEL: froundeven32:
+; SSE41-X64:       # %bb.0:
+; SSE41-X64-NEXT:    pushq %rax
+; SSE41-X64-NEXT:    .cfi_def_cfa_offset 16
+; SSE41-X64-NEXT:    callq roundevenf
+; SSE41-X64-NEXT:    popq %rax
+; SSE41-X64-NEXT:    .cfi_def_cfa_offset 8
+; SSE41-X64-NEXT:    retq
+;
+; AVX-X86-LABEL: froundeven32:
+; AVX-X86:       # %bb.0:
+; AVX-X86-NEXT:    pushl %eax
+; AVX-X86-NEXT:    .cfi_def_cfa_offset 8
+; AVX-X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
+; AVX-X86-NEXT:    calll roundevenf
+; AVX-X86-NEXT:    popl %eax
+; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
+; AVX-X86-NEXT:    retl
+;
+; AVX-X64-LABEL: froundeven32:
+; AVX-X64:       # %bb.0:
+; AVX-X64-NEXT:    pushq %rax
+; AVX-X64-NEXT:    .cfi_def_cfa_offset 16
+; AVX-X64-NEXT:    callq roundevenf
+; AVX-X64-NEXT:    popq %rax
+; AVX-X64-NEXT:    .cfi_def_cfa_offset 8
+; AVX-X64-NEXT:    retq
+  %res = call float @llvm.experimental.constrained.roundeven.f32(
+                        float %f, metadata !"fpexcept.strict") #0
+  ret float %res
+}
+
+define double @froundevenf64(double %f) #0 {
+; SSE41-X86-LABEL: froundevenf64:
+; SSE41-X86:       # %bb.0:
+; SSE41-X86-NEXT:    subl $8, %esp
+; SSE41-X86-NEXT:    .cfi_def_cfa_offset 12
+; SSE41-X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-X86-NEXT:    movsd %xmm0, (%esp)
+; SSE41-X86-NEXT:    calll roundeven
+; SSE41-X86-NEXT:    addl $8, %esp
+; SSE41-X86-NEXT:    .cfi_def_cfa_offset 4
+; SSE41-X86-NEXT:    retl
+;
+; SSE41-X64-LABEL: froundevenf64:
+; SSE41-X64:       # %bb.0:
+; SSE41-X64-NEXT:    pushq %rax
+; SSE41-X64-NEXT:    .cfi_def_cfa_offset 16
+; SSE41-X64-NEXT:    callq roundeven
+; SSE41-X64-NEXT:    popq %rax
+; SSE41-X64-NEXT:    .cfi_def_cfa_offset 8
+; SSE41-X64-NEXT:    retq
+;
+; AVX-X86-LABEL: froundevenf64:
+; AVX-X86:       # %bb.0:
+; AVX-X86-NEXT:    subl $8, %esp
+; AVX-X86-NEXT:    .cfi_def_cfa_offset 12
+; AVX-X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
+; AVX-X86-NEXT:    calll roundeven
+; AVX-X86-NEXT:    addl $8, %esp
+; AVX-X86-NEXT:    .cfi_def_cfa_offset 4
+; AVX-X86-NEXT:    retl
+;
+; AVX-X64-LABEL: froundevenf64:
+; AVX-X64:       # %bb.0:
+; AVX-X64-NEXT:    pushq %rax
+; AVX-X64-NEXT:    .cfi_def_cfa_offset 16
+; AVX-X64-NEXT:    callq roundeven
+; AVX-X64-NEXT:    popq %rax
+; AVX-X64-NEXT:    .cfi_def_cfa_offset 8
+; AVX-X64-NEXT:    retq
+  %res = call double @llvm.experimental.constrained.roundeven.f64(
+                        double %f, metadata !"fpexcept.strict") #0
+  ret double %res
+}
+
 attributes #0 = { strictfp }

diff  --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index b705c760287e..d2be7fb68900 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -913,6 +913,47 @@ entry:
   ret fp128 %round
 }
 
+define fp128 @roundeven(fp128 %x) nounwind strictfp {
+; CHECK-LABEL: roundeven:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    callq roundevenl
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+;
+; X86-LABEL: roundeven:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll roundevenl
+; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, 8(%esi)
+; X86-NEXT:    movl %edx, 12(%esi)
+; X86-NEXT:    movl %eax, (%esi)
+; X86-NEXT:    movl %ecx, 4(%esi)
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    retl $4
+entry:
+  %roundeven = call fp128 @llvm.experimental.constrained.roundeven.f128(fp128 %x, metadata !"fpexcept.strict") #0
+  ret fp128 %roundeven
+}
+
 define fp128 @sin(fp128 %x) nounwind strictfp {
 ; CHECK-LABEL: sin:
 ; CHECK:       # %bb.0: # %entry
@@ -1409,6 +1450,7 @@ declare fp128 @llvm.experimental.constrained.pow.f128(fp128, fp128, metadata, me
 declare fp128 @llvm.experimental.constrained.powi.f128(fp128, i32, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.rint.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.round.f128(fp128, metadata)
+declare fp128 @llvm.experimental.constrained.roundeven.f128(fp128, metadata)
 declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata)

diff  --git a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
new file mode 100644
index 000000000000..c199352d1423
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
@@ -0,0 +1,657 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -O3 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O3 | FileCheck %s --check-prefixes=CHECK,X64
+
+define x86_fp80 @fma(x86_fp80 %x, x86_fp80 %y, x86_fp80 %z) nounwind strictfp {
+; X86-LABEL: fma:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll fmal
+; X86-NEXT:    addl $36, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: fma:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $56, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq fmal
+; X64-NEXT:    addq $56, %rsp
+; X64-NEXT:    retq
+entry:
+  %fma = call x86_fp80 @llvm.experimental.constrained.fma.f80(x86_fp80 %x, x86_fp80 %y,  x86_fp80 %z, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %fma
+}
+
+define x86_fp80 @frem(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
+; X86-LABEL: frem:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll fmodl
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: frem:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq fmodl
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    retq
+entry:
+  %div = call x86_fp80 @llvm.experimental.constrained.frem.f80(x86_fp80 %x, x86_fp80 %y,  metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %div
+}
+
+define x86_fp80 @ceil(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: ceil:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll ceill
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: ceil:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq ceill
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %ceil = call x86_fp80 @llvm.experimental.constrained.ceil.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+  ret x86_fp80 %ceil
+}
+
+define x86_fp80 @cos(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: cos:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll cosl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: cos:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq cosl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %cos = call x86_fp80 @llvm.experimental.constrained.cos.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %cos
+}
+
+define x86_fp80 @exp(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: exp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll expl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: exp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq expl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %exp = call x86_fp80 @llvm.experimental.constrained.exp.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %exp
+}
+
+define x86_fp80 @exp2(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: exp2:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll exp2l
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: exp2:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq exp2l
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %exp2 = call x86_fp80 @llvm.experimental.constrained.exp2.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %exp2
+}
+
+define x86_fp80 @floor(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: floor:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll floorl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: floor:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq floorl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %floor = call x86_fp80 @llvm.experimental.constrained.floor.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+  ret x86_fp80 %floor
+}
+
+define x86_fp80 @log(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: log:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll logl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: log:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq logl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %log = call x86_fp80 @llvm.experimental.constrained.log.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %log
+}
+
+define x86_fp80 @log10(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: log10:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll log10l
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: log10:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq log10l
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %log10 = call x86_fp80 @llvm.experimental.constrained.log10.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %log10
+}
+
+define x86_fp80 @log2(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: log2:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll log2l
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: log2:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq log2l
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %log2 = call x86_fp80 @llvm.experimental.constrained.log2.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %log2
+}
+
+define x86_fp80 @maxnum(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
+; X86-LABEL: maxnum:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll fmaxl
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: maxnum:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq fmaxl
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    retq
+entry:
+  %maxnum = call x86_fp80 @llvm.experimental.constrained.maxnum.f80(x86_fp80 %x, x86_fp80 %y, metadata !"fpexcept.strict") #0
+  ret x86_fp80 %maxnum
+}
+
+define x86_fp80 @minnum(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
+; X86-LABEL: minnum:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll fminl
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: minnum:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq fminl
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    retq
+entry:
+  %minnum = call x86_fp80 @llvm.experimental.constrained.minnum.f80(x86_fp80 %x, x86_fp80 %y, metadata !"fpexcept.strict") #0
+  ret x86_fp80 %minnum
+}
+
+define x86_fp80 @nearbyint(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: nearbyint:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll nearbyintl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: nearbyint:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq nearbyintl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %nearbyint = call x86_fp80 @llvm.experimental.constrained.nearbyint.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %nearbyint
+}
+
+define x86_fp80 @pow(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
+; X86-LABEL: pow:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll powl
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: pow:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $40, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq powl
+; X64-NEXT:    addq $40, %rsp
+; X64-NEXT:    retq
+entry:
+  %pow = call x86_fp80 @llvm.experimental.constrained.pow.f80(x86_fp80 %x, x86_fp80 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %pow
+}
+
+define x86_fp80 @powi(x86_fp80 %x, i32 %y) nounwind strictfp {
+; X86-LABEL: powi:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll __powixf2
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: powi:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq __powixf2
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %powi = call x86_fp80 @llvm.experimental.constrained.powi.f80(x86_fp80 %x, i32 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %powi
+}
+
+define x86_fp80 @rint(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: rint:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll rintl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: rint:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq rintl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %rint = call x86_fp80 @llvm.experimental.constrained.rint.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %rint
+}
+
+define x86_fp80 @round(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: round:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll roundl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: round:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq roundl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %round = call x86_fp80 @llvm.experimental.constrained.round.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+  ret x86_fp80 %round
+}
+
+define x86_fp80 @roundeven(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: roundeven:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll roundevenl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: roundeven:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq roundevenl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %roundeven = call x86_fp80 @llvm.experimental.constrained.roundeven.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+  ret x86_fp80 %roundeven
+}
+
+define x86_fp80 @sin(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: sin:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll sinl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sin:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq sinl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %sin = call x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %sin
+}
+
+define x86_fp80 @trunc(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: trunc:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll truncl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: trunc:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq truncl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %trunc = call x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+  ret x86_fp80 %trunc
+}
+
+define i32 @lrint(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: lrint:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll lrintl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: lrint:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq lrintl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %rint = call i32 @llvm.experimental.constrained.lrint.i32.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret i32 %rint
+}
+
+define i64 @llrint(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: llrint:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll llrintl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: llrint:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq llrintl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %rint = call i64 @llvm.experimental.constrained.llrint.i64.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret i64 %rint
+}
+
+define i32 @lround(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: lround:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll lroundl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: lround:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq lroundl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %round = call i32 @llvm.experimental.constrained.lround.i32.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+  ret i32 %round
+}
+
+define i64 @llround(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: llround:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll llroundl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: llround:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq llroundl
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %round = call i64 @llvm.experimental.constrained.llround.i64.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+  ret i64 %round
+}
+
+attributes #0 = { strictfp }
+
+declare x86_fp80 @llvm.experimental.constrained.fma.f80(x86_fp80, x86_fp80, x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.frem.f80(x86_fp80, x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.ceil.f80(x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.cos.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.exp.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.exp2.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.floor.f80(x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.log.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.log10.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.log2.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.maxnum.f80(x86_fp80, x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.minnum.f80(x86_fp80, x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.nearbyint.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.pow.f80(x86_fp80, x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.powi.f80(x86_fp80, i32, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.rint.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.round.f80(x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.roundeven.f80(x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
+declare i32 @llvm.experimental.constrained.lrint.i32.f80(x86_fp80, metadata, metadata)
+declare i64 @llvm.experimental.constrained.llrint.i64.f80(x86_fp80, metadata, metadata)
+declare i32 @llvm.experimental.constrained.lround.i32.f80(x86_fp80, metadata)
+declare i64 @llvm.experimental.constrained.llround.i64.f80(x86_fp80, metadata)
