[llvm] 9182dc7 - [X86] Add llvm.roundeven test cases. Add f80 test cases for constrained intrinsics that lower to libcalls. NFC
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 25 13:31:11 PDT 2020
Author: Craig Topper
Date: 2020-07-25T13:29:47-07:00
New Revision: 9182dc78145b9f1505d7fcc34b818f6d8aabcfda
URL: https://github.com/llvm/llvm-project/commit/9182dc78145b9f1505d7fcc34b818f6d8aabcfda
DIFF: https://github.com/llvm/llvm-project/commit/9182dc78145b9f1505d7fcc34b818f6d8aabcfda.diff
LOG: [X86] Add llvm.roundeven test cases. Add f80 test cases for constrained intrinsics that lower to libcalls. NFC
Added:
llvm/test/CodeGen/X86/fp-roundeven.ll
llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
Modified:
llvm/test/CodeGen/X86/fp-cvt.ll
llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
Removed:
################################################################################
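For reference, below is a minimal sketch of the IR patterns the new tests exercise: a plain llvm.roundeven call on x86_fp80 and a strict (constrained) variant, both of which lower to the roundevenl libcall on x86. Function names here are illustrative, not taken from the new files; the f80 constrained form is assumed to mirror the f32/f64 declarations added to fp-strict-scalar-round.ll below, and the actual f80 strict coverage lives in the new fp80-strict-libcalls.ll.

; Sketch only: illustrative names. llvm.roundeven.f80 matches the calls added
; to fp-cvt.ll; the constrained intrinsic takes the value plus an fpexcept
; metadata argument, as in the f32/f64 declarations in this commit.
define x86_fp80 @roundeven_example(x86_fp80 %x) nounwind {
  %r = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %x)
  ret x86_fp80 %r
}

define x86_fp80 @roundeven_strict_example(x86_fp80 %x) #0 {
  %r = call x86_fp80 @llvm.experimental.constrained.roundeven.f80(
            x86_fp80 %x, metadata !"fpexcept.strict") #0
  ret x86_fp80 %r
}

declare x86_fp80 @llvm.roundeven.f80(x86_fp80)
declare x86_fp80 @llvm.experimental.constrained.roundeven.f80(x86_fp80, metadata)

; Callers of constrained intrinsics must be marked strictfp.
attributes #0 = { strictfp }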
diff --git a/llvm/test/CodeGen/X86/fp-cvt.ll b/llvm/test/CodeGen/X86/fp-cvt.ll
index 667c2d414ed1..cedbfd2e9bff 100644
--- a/llvm/test/CodeGen/X86/fp-cvt.ll
+++ b/llvm/test/CodeGen/X86/fp-cvt.ll
@@ -1090,3 +1090,55 @@ define x86_fp80 @rint_fp80_ld(x86_fp80 *%a0) nounwind {
}
declare x86_fp80 @llvm.rint.f80(x86_fp80 %p)
+
+;
+; roundeven
+;
+
+define x86_fp80 @roundeven_fp80(x86_fp80 %a0) nounwind {
+; X86-LABEL: roundeven_fp80:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll roundevenl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: roundeven_fp80:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq roundevenl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+ %1 = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %a0)
+ ret x86_fp80 %1
+}
+
+define x86_fp80 @roundeven_fp80_ld(x86_fp80 *%a0) nounwind {
+; X86-LABEL: roundeven_fp80_ld:
+; X86: # %bb.0:
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: fldt (%eax)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: calll roundevenl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: roundeven_fp80_ld:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt (%rdi)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq roundevenl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+ %1 = load x86_fp80, x86_fp80 *%a0
+ %2 = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %1)
+ ret x86_fp80 %2
+}
+
+declare x86_fp80 @llvm.roundeven.f80(x86_fp80 %p)
diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll
new file mode 100644
index 000000000000..a3eae0137f3e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp-roundeven.ll
@@ -0,0 +1,1044 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+
+define float @roundeven_f32(float %x) {
+; SSE2-LABEL: roundeven_f32:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: jmp _roundevenf ## TAILCALL
+;
+; SSE41-LABEL: roundeven_f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: jmp _roundevenf ## TAILCALL
+;
+; AVX-LABEL: roundeven_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: jmp _roundevenf ## TAILCALL
+ %a = call float @llvm.roundeven.f32(float %x)
+ ret float %a
+}
+
+define double @roundeven_f64(double %x) {
+; SSE2-LABEL: roundeven_f64:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: jmp _roundeven ## TAILCALL
+;
+; SSE41-LABEL: roundeven_f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: jmp _roundeven ## TAILCALL
+;
+; AVX-LABEL: roundeven_f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: jmp _roundeven ## TAILCALL
+ %a = call double @llvm.roundeven.f64(double %x)
+ ret double %a
+}
+
+define <4 x float> @roundeven_v4f32(<4 x float> %x) {
+; SSE2-LABEL: roundeven_v4f32:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $56, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 64
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: addq $56, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: roundeven_v4f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: subq $40, %rsp
+; SSE41-NEXT: .cfi_def_cfa_offset 48
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT: ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: addq $40, %rsp
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: roundeven_v4f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 48
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[1,0]
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: retq
+ %a = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
+ ret <4 x float> %a
+}
+
+define <2 x double> @roundeven_v2f64(<2 x double> %x) {
+; SSE2-LABEL: roundeven_v2f64:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $40, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 48
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: addq $40, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: roundeven_v2f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: subq $40, %rsp
+; SSE41-NEXT: .cfi_def_cfa_offset 48
+; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: addq $40, %rsp
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: roundeven_v2f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 48
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: callq _roundeven
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[1,0]
+; AVX-NEXT: callq _roundeven
+; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: retq
+ %a = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x)
+ ret <2 x double> %a
+}
+
+define <8 x float> @roundeven_v8f32(<8 x float> %x) {
+; SSE2-LABEL: roundeven_v8f32:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $72, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 80
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: addq $72, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: roundeven_v8f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: subq $56, %rsp
+; SSE41-NEXT: .cfi_def_cfa_offset 64
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movshdup (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT: ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT: ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: addq $56, %rsp
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: roundeven_v8f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: subq $88, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 96
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[1,0]
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[1,0]
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX-NEXT: callq _roundevenf
+; AVX-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX-NEXT: addq $88, %rsp
+; AVX-NEXT: retq
+ %a = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %x)
+ ret <8 x float> %a
+}
+
+define <4 x double> @roundeven_v4f64(<4 x double> %x) {
+; SSE2-LABEL: roundeven_v4f64:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $56, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 64
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: addq $56, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: roundeven_v4f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: subq $56, %rsp
+; SSE41-NEXT: .cfi_def_cfa_offset 64
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: addq $56, %rsp
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: roundeven_v4f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: subq $88, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 96
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq _roundeven
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[1,0]
+; AVX-NEXT: callq _roundeven
+; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: callq _roundeven
+; AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX-NEXT: ## xmm0 = mem[1,0]
+; AVX-NEXT: callq _roundeven
+; AVX-NEXT: vmovapd (%rsp), %xmm1 ## 16-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX-NEXT: addq $88, %rsp
+; AVX-NEXT: retq
+ %a = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %x)
+ ret <4 x double> %a
+}
+
+define <16 x float> @roundeven_v16f32(<16 x float> %x) {
+; SSE2-LABEL: roundeven_v16f32:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $104, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 112
+; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundevenf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm3 = xmm3[0],mem[0]
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE2-NEXT: addq $104, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: roundeven_v16f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: subq $88, %rsp
+; SSE41-NEXT: .cfi_def_cfa_offset 96
+; SSE41-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT: ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movshdup (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT: ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT: ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE41-NEXT: ## xmm0 = mem[1,1,3,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT: callq _roundevenf
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[0]
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE41-NEXT: addq $88, %rsp
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: roundeven_v16f32:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: subq $152, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 160
+; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,0]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,0]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,0]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,0]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX1-NEXT: callq _roundevenf
+; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 ## 16-byte Folded Reload
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT: addq $152, %rsp
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: roundeven_v16f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: subq $184, %rsp
+; AVX512-NEXT: .cfi_def_cfa_offset 192
+; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,0]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,0]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,0]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,1,3,3]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,0]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vpermilps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[3,1,2,3]
+; AVX512-NEXT: callq _roundevenf
+; AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX512-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 ## 32-byte Folded Reload
+; AVX512-NEXT: addq $184, %rsp
+; AVX512-NEXT: retq
+ %a = call <16 x float> @llvm.roundeven.v16f32(<16 x float> %x)
+ ret <16 x float> %a
+}
+
+define <8 x double> @roundeven_v8f64(<8 x double> %x) {
+; SSE2-LABEL: roundeven_v8f64:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $88, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 96
+; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundeven
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE2-NEXT: addq $88, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: roundeven_v8f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: subq $88, %rsp
+; SSE41-NEXT: .cfi_def_cfa_offset 96
+; SSE41-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE41-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE41-NEXT: callq _roundeven
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; SSE41-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE41-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE41-NEXT: addq $88, %rsp
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: roundeven_v8f64:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: subq $120, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 128
+; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq _roundeven
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,0]
+; AVX1-NEXT: callq _roundeven
+; AVX1-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq _roundeven
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,0]
+; AVX1-NEXT: callq _roundeven
+; AVX1-NEXT: vmovapd (%rsp), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq _roundeven
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,0]
+; AVX1-NEXT: callq _roundeven
+; AVX1-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq _roundeven
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX1-NEXT: ## xmm0 = mem[1,0]
+; AVX1-NEXT: callq _roundeven
+; AVX1-NEXT: vmovapd (%rsp), %xmm1 ## 16-byte Reload
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 ## 16-byte Folded Reload
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 ## 32-byte Reload
+; AVX1-NEXT: addq $120, %rsp
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: roundeven_v8f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: subq $184, %rsp
+; AVX512-NEXT: .cfi_def_cfa_offset 192
+; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq _roundeven
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,0]
+; AVX512-NEXT: callq _roundeven
+; AVX512-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq _roundeven
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,0]
+; AVX512-NEXT: callq _roundeven
+; AVX512-NEXT: vmovapd (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 32-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq _roundeven
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,0]
+; AVX512-NEXT: callq _roundeven
+; AVX512-NEXT: vmovapd (%rsp), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) ## 16-byte Spill
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 ## 64-byte Reload
+; AVX512-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq _roundeven
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; AVX512-NEXT: ## xmm0 = mem[1,0]
+; AVX512-NEXT: callq _roundeven
+; AVX512-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 ## 16-byte Folded Reload
+; AVX512-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 ## 32-byte Folded Reload
+; AVX512-NEXT: addq $184, %rsp
+; AVX512-NEXT: retq
+ %a = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %x)
+ ret <8 x double> %a
+}
+
+declare float @llvm.roundeven.f32(float)
+declare double @llvm.roundeven.f64(double)
+declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
+declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
+declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
+declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
+declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
+declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
index da05e8be432e..f5a6af9c4d65 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round.ll
@@ -16,6 +16,10 @@ declare float @llvm.experimental.constrained.rint.f32(float, metadata, metadata)
declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata)
declare float @llvm.experimental.constrained.nearbyint.f32(float, metadata, metadata)
declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata)
+declare float @llvm.experimental.constrained.round.f32(float, metadata)
+declare double @llvm.experimental.constrained.round.f64(double, metadata)
+declare float @llvm.experimental.constrained.roundeven.f32(float, metadata)
+declare double @llvm.experimental.constrained.roundeven.f64(double, metadata)
define float @fceil32(float %f) #0 {
; SSE41-X86-LABEL: fceil32:
@@ -491,4 +495,184 @@ define double @fnearbyintf64(double %f) #0 {
ret double %res
}
+define float @fround32(float %f) #0 {
+; SSE41-X86-LABEL: fround32:
+; SSE41-X86: # %bb.0:
+; SSE41-X86-NEXT: pushl %eax
+; SSE41-X86-NEXT: .cfi_def_cfa_offset 8
+; SSE41-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-X86-NEXT: movss %xmm0, (%esp)
+; SSE41-X86-NEXT: calll roundf
+; SSE41-X86-NEXT: popl %eax
+; SSE41-X86-NEXT: .cfi_def_cfa_offset 4
+; SSE41-X86-NEXT: retl
+;
+; SSE41-X64-LABEL: fround32:
+; SSE41-X64: # %bb.0:
+; SSE41-X64-NEXT: pushq %rax
+; SSE41-X64-NEXT: .cfi_def_cfa_offset 16
+; SSE41-X64-NEXT: callq roundf
+; SSE41-X64-NEXT: popq %rax
+; SSE41-X64-NEXT: .cfi_def_cfa_offset 8
+; SSE41-X64-NEXT: retq
+;
+; AVX-X86-LABEL: fround32:
+; AVX-X86: # %bb.0:
+; AVX-X86-NEXT: pushl %eax
+; AVX-X86-NEXT: .cfi_def_cfa_offset 8
+; AVX-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-X86-NEXT: vmovss %xmm0, (%esp)
+; AVX-X86-NEXT: calll roundf
+; AVX-X86-NEXT: popl %eax
+; AVX-X86-NEXT: .cfi_def_cfa_offset 4
+; AVX-X86-NEXT: retl
+;
+; AVX-X64-LABEL: fround32:
+; AVX-X64: # %bb.0:
+; AVX-X64-NEXT: pushq %rax
+; AVX-X64-NEXT: .cfi_def_cfa_offset 16
+; AVX-X64-NEXT: callq roundf
+; AVX-X64-NEXT: popq %rax
+; AVX-X64-NEXT: .cfi_def_cfa_offset 8
+; AVX-X64-NEXT: retq
+ %res = call float @llvm.experimental.constrained.round.f32(
+ float %f, metadata !"fpexcept.strict") #0
+ ret float %res
+}
+
+define double @froundf64(double %f) #0 {
+; SSE41-X86-LABEL: froundf64:
+; SSE41-X86: # %bb.0:
+; SSE41-X86-NEXT: subl $8, %esp
+; SSE41-X86-NEXT: .cfi_def_cfa_offset 12
+; SSE41-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-X86-NEXT: movsd %xmm0, (%esp)
+; SSE41-X86-NEXT: calll round
+; SSE41-X86-NEXT: addl $8, %esp
+; SSE41-X86-NEXT: .cfi_def_cfa_offset 4
+; SSE41-X86-NEXT: retl
+;
+; SSE41-X64-LABEL: froundf64:
+; SSE41-X64: # %bb.0:
+; SSE41-X64-NEXT: pushq %rax
+; SSE41-X64-NEXT: .cfi_def_cfa_offset 16
+; SSE41-X64-NEXT: callq round
+; SSE41-X64-NEXT: popq %rax
+; SSE41-X64-NEXT: .cfi_def_cfa_offset 8
+; SSE41-X64-NEXT: retq
+;
+; AVX-X86-LABEL: froundf64:
+; AVX-X86: # %bb.0:
+; AVX-X86-NEXT: subl $8, %esp
+; AVX-X86-NEXT: .cfi_def_cfa_offset 12
+; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-X86-NEXT: vmovsd %xmm0, (%esp)
+; AVX-X86-NEXT: calll round
+; AVX-X86-NEXT: addl $8, %esp
+; AVX-X86-NEXT: .cfi_def_cfa_offset 4
+; AVX-X86-NEXT: retl
+;
+; AVX-X64-LABEL: froundf64:
+; AVX-X64: # %bb.0:
+; AVX-X64-NEXT: pushq %rax
+; AVX-X64-NEXT: .cfi_def_cfa_offset 16
+; AVX-X64-NEXT: callq round
+; AVX-X64-NEXT: popq %rax
+; AVX-X64-NEXT: .cfi_def_cfa_offset 8
+; AVX-X64-NEXT: retq
+ %res = call double @llvm.experimental.constrained.round.f64(
+ double %f, metadata !"fpexcept.strict") #0
+ ret double %res
+}
+
+define float @froundeven32(float %f) #0 {
+; SSE41-X86-LABEL: froundeven32:
+; SSE41-X86: # %bb.0:
+; SSE41-X86-NEXT: pushl %eax
+; SSE41-X86-NEXT: .cfi_def_cfa_offset 8
+; SSE41-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-X86-NEXT: movss %xmm0, (%esp)
+; SSE41-X86-NEXT: calll roundevenf
+; SSE41-X86-NEXT: popl %eax
+; SSE41-X86-NEXT: .cfi_def_cfa_offset 4
+; SSE41-X86-NEXT: retl
+;
+; SSE41-X64-LABEL: froundeven32:
+; SSE41-X64: # %bb.0:
+; SSE41-X64-NEXT: pushq %rax
+; SSE41-X64-NEXT: .cfi_def_cfa_offset 16
+; SSE41-X64-NEXT: callq roundevenf
+; SSE41-X64-NEXT: popq %rax
+; SSE41-X64-NEXT: .cfi_def_cfa_offset 8
+; SSE41-X64-NEXT: retq
+;
+; AVX-X86-LABEL: froundeven32:
+; AVX-X86: # %bb.0:
+; AVX-X86-NEXT: pushl %eax
+; AVX-X86-NEXT: .cfi_def_cfa_offset 8
+; AVX-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-X86-NEXT: vmovss %xmm0, (%esp)
+; AVX-X86-NEXT: calll roundevenf
+; AVX-X86-NEXT: popl %eax
+; AVX-X86-NEXT: .cfi_def_cfa_offset 4
+; AVX-X86-NEXT: retl
+;
+; AVX-X64-LABEL: froundeven32:
+; AVX-X64: # %bb.0:
+; AVX-X64-NEXT: pushq %rax
+; AVX-X64-NEXT: .cfi_def_cfa_offset 16
+; AVX-X64-NEXT: callq roundevenf
+; AVX-X64-NEXT: popq %rax
+; AVX-X64-NEXT: .cfi_def_cfa_offset 8
+; AVX-X64-NEXT: retq
+ %res = call float @llvm.experimental.constrained.roundeven.f32(
+ float %f, metadata !"fpexcept.strict") #0
+ ret float %res
+}
+
+define double @froundevenf64(double %f) #0 {
+; SSE41-X86-LABEL: froundevenf64:
+; SSE41-X86: # %bb.0:
+; SSE41-X86-NEXT: subl $8, %esp
+; SSE41-X86-NEXT: .cfi_def_cfa_offset 12
+; SSE41-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-X86-NEXT: movsd %xmm0, (%esp)
+; SSE41-X86-NEXT: calll roundeven
+; SSE41-X86-NEXT: addl $8, %esp
+; SSE41-X86-NEXT: .cfi_def_cfa_offset 4
+; SSE41-X86-NEXT: retl
+;
+; SSE41-X64-LABEL: froundevenf64:
+; SSE41-X64: # %bb.0:
+; SSE41-X64-NEXT: pushq %rax
+; SSE41-X64-NEXT: .cfi_def_cfa_offset 16
+; SSE41-X64-NEXT: callq roundeven
+; SSE41-X64-NEXT: popq %rax
+; SSE41-X64-NEXT: .cfi_def_cfa_offset 8
+; SSE41-X64-NEXT: retq
+;
+; AVX-X86-LABEL: froundevenf64:
+; AVX-X86: # %bb.0:
+; AVX-X86-NEXT: subl $8, %esp
+; AVX-X86-NEXT: .cfi_def_cfa_offset 12
+; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-X86-NEXT: vmovsd %xmm0, (%esp)
+; AVX-X86-NEXT: calll roundeven
+; AVX-X86-NEXT: addl $8, %esp
+; AVX-X86-NEXT: .cfi_def_cfa_offset 4
+; AVX-X86-NEXT: retl
+;
+; AVX-X64-LABEL: froundevenf64:
+; AVX-X64: # %bb.0:
+; AVX-X64-NEXT: pushq %rax
+; AVX-X64-NEXT: .cfi_def_cfa_offset 16
+; AVX-X64-NEXT: callq roundeven
+; AVX-X64-NEXT: popq %rax
+; AVX-X64-NEXT: .cfi_def_cfa_offset 8
+; AVX-X64-NEXT: retq
+ %res = call double @llvm.experimental.constrained.roundeven.f64(
+ double %f, metadata !"fpexcept.strict") #0
+ ret double %res
+}
+
attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index b705c760287e..d2be7fb68900 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -913,6 +913,47 @@ entry:
ret fp128 %round
}
+define fp128 @roundeven(fp128 %x) nounwind strictfp {
+; CHECK-LABEL: roundeven:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq roundevenl
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+;
+; X86-LABEL: roundeven:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll roundevenl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
+entry:
+ %roundeven = call fp128 @llvm.experimental.constrained.roundeven.f128(fp128 %x, metadata !"fpexcept.strict") #0
+ ret fp128 %roundeven
+}
+
define fp128 @sin(fp128 %x) nounwind strictfp {
; CHECK-LABEL: sin:
; CHECK: # %bb.0: # %entry
@@ -1409,6 +1450,7 @@ declare fp128 @llvm.experimental.constrained.pow.f128(fp128, fp128, metadata, me
declare fp128 @llvm.experimental.constrained.powi.f128(fp128, i32, metadata, metadata)
declare fp128 @llvm.experimental.constrained.rint.f128(fp128, metadata, metadata)
declare fp128 @llvm.experimental.constrained.round.f128(fp128, metadata)
+declare fp128 @llvm.experimental.constrained.roundeven.f128(fp128, metadata)
declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata)
declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata)
declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata)
diff --git a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
new file mode 100644
index 000000000000..c199352d1423
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
@@ -0,0 +1,657 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -O3 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O3 | FileCheck %s --check-prefixes=CHECK,X64
+
+define x86_fp80 @fma(x86_fp80 %x, x86_fp80 %y, x86_fp80 %z) nounwind strictfp {
+; X86-LABEL: fma:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $36, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll fmal
+; X86-NEXT: addl $36, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: fma:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $56, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq fmal
+; X64-NEXT: addq $56, %rsp
+; X64-NEXT: retq
+entry:
+ %fma = call x86_fp80 @llvm.experimental.constrained.fma.f80(x86_fp80 %x, x86_fp80 %y, x86_fp80 %z, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %fma
+}
+
+define x86_fp80 @frem(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
+; X86-LABEL: frem:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll fmodl
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: frem:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq fmodl
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: retq
+entry:
+ %div = call x86_fp80 @llvm.experimental.constrained.frem.f80(x86_fp80 %x, x86_fp80 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %div
+}
+
+define x86_fp80 @ceil(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: ceil:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll ceill
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: ceil:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq ceill
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %ceil = call x86_fp80 @llvm.experimental.constrained.ceil.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+ ret x86_fp80 %ceil
+}
+
+define x86_fp80 @cos(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: cos:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll cosl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: cos:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq cosl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %cos = call x86_fp80 @llvm.experimental.constrained.cos.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %cos
+}
+
+define x86_fp80 @exp(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: exp:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll expl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: exp:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq expl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %exp = call x86_fp80 @llvm.experimental.constrained.exp.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %exp
+}
+
+define x86_fp80 @exp2(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: exp2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll exp2l
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: exp2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq exp2l
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %exp2 = call x86_fp80 @llvm.experimental.constrained.exp2.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %exp2
+}
+
+define x86_fp80 @floor(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: floor:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll floorl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: floor:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq floorl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %floor = call x86_fp80 @llvm.experimental.constrained.floor.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+ ret x86_fp80 %floor
+}
+
+define x86_fp80 @log(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: log:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll logl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: log:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq logl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %log = call x86_fp80 @llvm.experimental.constrained.log.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %log
+}
+
+define x86_fp80 @log10(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: log10:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll log10l
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: log10:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq log10l
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %log10 = call x86_fp80 @llvm.experimental.constrained.log10.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %log10
+}
+
+define x86_fp80 @log2(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: log2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll log2l
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: log2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq log2l
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %log2 = call x86_fp80 @llvm.experimental.constrained.log2.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %log2
+}
+
+define x86_fp80 @maxnum(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
+; X86-LABEL: maxnum:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll fmaxl
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: maxnum:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq fmaxl
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: retq
+entry:
+ %maxnum = call x86_fp80 @llvm.experimental.constrained.maxnum.f80(x86_fp80 %x, x86_fp80 %y, metadata !"fpexcept.strict") #0
+ ret x86_fp80 %maxnum
+}
+
+define x86_fp80 @minnum(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
+; X86-LABEL: minnum:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll fminl
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: minnum:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq fminl
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: retq
+entry:
+ %minnum = call x86_fp80 @llvm.experimental.constrained.minnum.f80(x86_fp80 %x, x86_fp80 %y, metadata !"fpexcept.strict") #0
+ ret x86_fp80 %minnum
+}
+
+define x86_fp80 @nearbyint(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: nearbyint:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll nearbyintl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: nearbyint:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq nearbyintl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %nearbyint = call x86_fp80 @llvm.experimental.constrained.nearbyint.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %nearbyint
+}
+
+define x86_fp80 @pow(x86_fp80 %x, x86_fp80 %y) nounwind strictfp {
+; X86-LABEL: pow:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $24, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll powl
+; X86-NEXT: addl $24, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: pow:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq powl
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: retq
+entry:
+ %pow = call x86_fp80 @llvm.experimental.constrained.pow.f80(x86_fp80 %x, x86_fp80 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %pow
+}
+
+define x86_fp80 @powi(x86_fp80 %x, i32 %y) nounwind strictfp {
+; X86-LABEL: powi:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: wait
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll __powixf2
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: powi:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq __powixf2
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %powi = call x86_fp80 @llvm.experimental.constrained.powi.f80(x86_fp80 %x, i32 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %powi
+}
+
+define x86_fp80 @rint(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: rint:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll rintl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: rint:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq rintl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %rint = call x86_fp80 @llvm.experimental.constrained.rint.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %rint
+}
+
+define x86_fp80 @round(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: round:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll roundl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: round:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq roundl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %round = call x86_fp80 @llvm.experimental.constrained.round.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+ ret x86_fp80 %round
+}
+
+define x86_fp80 @roundeven(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: roundeven:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll roundevenl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: roundeven:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq roundevenl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %roundeven = call x86_fp80 @llvm.experimental.constrained.roundeven.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+ ret x86_fp80 %roundeven
+}
+
+define x86_fp80 @sin(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: sin:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll sinl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: sin:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq sinl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %sin = call x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret x86_fp80 %sin
+}
+
+define x86_fp80 @trunc(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: trunc:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll truncl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: trunc:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq truncl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %trunc = call x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+ ret x86_fp80 %trunc
+}
+
+define i32 @lrint(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: lrint:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll lrintl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: lrint:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq lrintl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %rint = call i32 @llvm.experimental.constrained.lrint.i32.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret i32 %rint
+}
+
+define i64 @llrint(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: llrint:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: llrint:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq llrintl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %rint = call i64 @llvm.experimental.constrained.llrint.i64.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret i64 %rint
+}
+
+define i32 @lround(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: lround:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll lroundl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: lround:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq lroundl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %round = call i32 @llvm.experimental.constrained.lround.i32.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+ ret i32 %round
+}
+
+define i64 @llround(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: llround:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fstpt (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll llroundl
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: llround:
+; X64: # %bb.0: # %entry
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: wait
+; X64-NEXT: callq llroundl
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+entry:
+ %round = call i64 @llvm.experimental.constrained.llround.i64.f80(x86_fp80 %x, metadata !"fpexcept.strict") #0
+ ret i64 %round
+}
+
+attributes #0 = { strictfp }
+
+declare x86_fp80 @llvm.experimental.constrained.fma.f80(x86_fp80, x86_fp80, x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.frem.f80(x86_fp80, x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.ceil.f80(x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.cos.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.exp.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.exp2.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.floor.f80(x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.log.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.log10.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.log2.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.maxnum.f80(x86_fp80, x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.minnum.f80(x86_fp80, x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.nearbyint.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.pow.f80(x86_fp80, x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.powi.f80(x86_fp80, i32, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.rint.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.round.f80(x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.roundeven.f80(x86_fp80, metadata)
+declare x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
+declare i32 @llvm.experimental.constrained.lrint.i32.f80(x86_fp80, metadata, metadata)
+declare i64 @llvm.experimental.constrained.llrint.i64.f80(x86_fp80, metadata, metadata)
+declare i32 @llvm.experimental.constrained.lround.i32.f80(x86_fp80, metadata)
+declare i64 @llvm.experimental.constrained.llround.i64.f80(x86_fp80, metadata)
More information about the llvm-commits mailing list