[llvm] r335037 - [X86] VRNDSCALE* folding from masked and scalar ffloor and fceil patterns
Artur Pilipenko via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 27 08:28:52 PDT 2018
Yes, we already found the fix.
When I glanced through the recent changes, I didn’t spot a revert or anything that looked obviously related.
Artur
On 27 Jun 2018, at 18:25, Craig Topper <craig.topper at gmail.com<mailto:craig.topper at gmail.com>> wrote:
I think this is PR37879, which was already fixed. Are you on an earlier revision?
On Wed, Jun 27, 2018 at 5:21 AM Artur Pilipenko via llvm-commits <llvm-commits at lists.llvm.org<mailto:llvm-commits at lists.llvm.org>> wrote:
This change is causing failures in our downstream testing. We see something like:
LLVM ERROR: Cannot select: 0x8f7cd0: v1i1 = scalar_to_vector 0x8f7c00
0x8f7c00: i64,ch = CopyFromReg 0x85acf8, Register:i64 %11
0x8f7e70: i64 = Register %11
Any ideas on what this might be?
I’ll try to come up with a smaller reproducer, but we might need to revert the change for the time being.
Artur
> On 19 Jun 2018, at 13:37, Mikhail Dvoretckii via llvm-commits <llvm-commits at lists.llvm.org<mailto:llvm-commits at lists.llvm.org>> wrote:
>
> Author: mike.dvoretsky
> Date: Tue Jun 19 03:37:52 2018
> New Revision: 335037
>
> URL: http://llvm.org/viewvc/llvm-project?rev=335037&view=rev
> Log:
> [X86] VRNDSCALE* folding from masked and scalar ffloor and fceil patterns
>
> This patch handles back-end folding of generic patterns created by lowering the
> X86 rounding intrinsics to native IR in cases where the instruction isn't a
> straightforward packed values rounding operation, but a masked operation or a
> scalar operation.
>
> Differential Revision: https://reviews.llvm.org/D45203
>
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> llvm/trunk/lib/Target/X86/X86InstrAVX512.td
> llvm/trunk/lib/Target/X86/X86InstrSSE.td
> llvm/trunk/test/CodeGen/X86/vec_floor.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=335037&r1=335036&r2=335037&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Jun 19 03:37:52 2018
> @@ -39121,9 +39121,31 @@ static SDValue combineScalarToVector(SDN
> // TODO: SimplifyDemandedBits instead?
> if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
> if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
> - if (C->getAPIntValue().isOneValue())
> - return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
> - Src.getOperand(0));
> + if (C->getAPIntValue().isOneValue()) {
> + SDValue Mask = Src.getOperand(0);
> + if (Mask.getOpcode() == ISD::TRUNCATE &&
> + Mask.getOperand(0).getValueType() != MVT::i16)
> + Mask = Mask.getOperand(0);
> + return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, Mask);
> + }
> +
> + // The result of AND may also be truncated. This occurs in code for lowered
> + // masked scalar intrinsics.
> + if (VT == MVT::v1i1 && Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
> + Src.getOperand(0).getOpcode() == ISD::AND &&
> + Src.getOperand(0).hasOneUse())
> + if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(0).getOperand(1)))
> + if (C->getAPIntValue().isOneValue()) {
> + SDValue Mask = Src.getOperand(0).getOperand(0);
> + if (Mask.getOpcode() == ISD::TRUNCATE &&
> + Mask.getOperand(0).getValueType() != MVT::i16)
> + Mask = Mask.getOperand(0);
> + // Check if the initial value is an i16. scalar_to_vector fails to
> + // select for that type, so the combine should be aborted.
> + if (Mask.getValueType() == MVT::i16)
> + return SDValue();
> + return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, Mask);
> + }
>
> return SDValue();
> }
>
> Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=335037&r1=335036&r2=335037&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Tue Jun 19 03:37:52 2018
> @@ -8781,16 +8781,50 @@ multiclass avx512_masked_scalar_imm<SDNo
> def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
> (OpNode (extractelt _.VT:$src2, (iPTR 0))),
> (extractelt _.VT:$dst, (iPTR 0))))),
> - (!cast<Instruction>("V"#OpcPrefix#r_Intk)
> + (!cast<Instruction>("V"#OpcPrefix#Zr_Intk)
> _.VT:$dst, OutMask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
>
> def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
> (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))),
> - (!cast<Instruction>("V"#OpcPrefix#r_Intkz)
> + (!cast<Instruction>("V"#OpcPrefix#Zr_Intkz)
> OutMask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
> }
> }
>
> +defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESS", X86Movss,
> + (v1i1 (scalar_to_vector GR32:$mask)),
> + v4f32x_info, fp32imm0, 0x01,
> + (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
> +defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESS", X86Movss,
> + (v1i1 (scalar_to_vector GR8:$mask)),
> + v4f32x_info, fp32imm0, 0x01,
> + (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
> +defm : avx512_masked_scalar_imm<fceil, "RNDSCALESS", X86Movss,
> + (v1i1 (scalar_to_vector GR32:$mask)),
> + v4f32x_info, fp32imm0, 0x02,
> + (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
> +defm : avx512_masked_scalar_imm<fceil, "RNDSCALESS", X86Movss,
> + (v1i1 (scalar_to_vector GR8:$mask)),
> + v4f32x_info, fp32imm0, 0x02,
> + (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
> +defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESD", X86Movsd,
> + (v1i1 (scalar_to_vector GR32:$mask)),
> + v2f64x_info, fp64imm0, 0x01,
> + (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
> +defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESD", X86Movsd,
> + (v1i1 (scalar_to_vector GR8:$mask)),
> + v2f64x_info, fp64imm0, 0x01,
> + (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
> +defm : avx512_masked_scalar_imm<fceil, "RNDSCALESD", X86Movsd,
> + (v1i1 (scalar_to_vector GR32:$mask)),
> + v2f64x_info, fp64imm0, 0x02,
> + (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
> +defm : avx512_masked_scalar_imm<fceil, "RNDSCALESD", X86Movsd,
> + (v1i1 (scalar_to_vector GR8:$mask)),
> + v2f64x_info, fp64imm0, 0x02,
> + (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
> +
> +
> //-------------------------------------------------
> // Integer truncate and extend operations
> //-------------------------------------------------
> @@ -9936,10 +9970,18 @@ defm VGETMANTSS: avx512_common_fp_sae_sc
> let Predicates = [HasAVX512] in {
> def : Pat<(v16f32 (ffloor VR512:$src)),
> (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
> +def : Pat<(v16f32 (vselect VK16WM:$mask, (ffloor VR512:$src), VR512:$dst)),
> + (VRNDSCALEPSZrrik VR512:$dst, VK16WM:$mask, VR512:$src, (i32 0x9))>;
> +def : Pat<(v16f32 (vselect VK16WM:$mask, (ffloor VR512:$src), v16f32_info.ImmAllZerosV)),
> + (VRNDSCALEPSZrrikz VK16WM:$mask, VR512:$src, (i32 0x9))>;
> def : Pat<(v16f32 (fnearbyint VR512:$src)),
> (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
> def : Pat<(v16f32 (fceil VR512:$src)),
> (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>;
> +def : Pat<(v16f32 (vselect VK16WM:$mask, (fceil VR512:$src), VR512:$dst)),
> + (VRNDSCALEPSZrrik VR512:$dst, VK16WM:$mask, VR512:$src, (i32 0xA))>;
> +def : Pat<(v16f32 (vselect VK16WM:$mask, (fceil VR512:$src), v16f32_info.ImmAllZerosV)),
> + (VRNDSCALEPSZrrikz VK16WM:$mask, VR512:$src, (i32 0xA))>;
> def : Pat<(v16f32 (frint VR512:$src)),
> (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
> def : Pat<(v16f32 (ftrunc VR512:$src)),
> @@ -9958,10 +10000,18 @@ def : Pat<(v16f32 (ftrunc (loadv16f32 ad
>
> def : Pat<(v8f64 (ffloor VR512:$src)),
> (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>;
> +def : Pat<(v8f64 (vselect VK8WM:$mask, (ffloor VR512:$src), VR512:$dst)),
> + (VRNDSCALEPDZrrik VR512:$dst, VK8WM:$mask, VR512:$src, (i32 0x9))>;
> +def : Pat<(v8f64 (vselect VK8WM:$mask, (ffloor VR512:$src), v8f64_info.ImmAllZerosV)),
> + (VRNDSCALEPDZrrikz VK8WM:$mask, VR512:$src, (i32 0x9))>;
> def : Pat<(v8f64 (fnearbyint VR512:$src)),
> (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
> def : Pat<(v8f64 (fceil VR512:$src)),
> (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>;
> +def : Pat<(v8f64 (vselect VK8WM:$mask, (fceil VR512:$src), VR512:$dst)),
> + (VRNDSCALEPDZrrik VR512:$dst, VK8WM:$mask, VR512:$src, (i32 0xA))>;
> +def : Pat<(v8f64 (vselect VK8WM:$mask, (fceil VR512:$src), v8f64_info.ImmAllZerosV)),
> + (VRNDSCALEPDZrrikz VK8WM:$mask, VR512:$src, (i32 0xA))>;
> def : Pat<(v8f64 (frint VR512:$src)),
> (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
> def : Pat<(v8f64 (ftrunc VR512:$src)),
> @@ -9982,10 +10032,18 @@ def : Pat<(v8f64 (ftrunc (loadv8f64 addr
> let Predicates = [HasVLX] in {
> def : Pat<(v4f32 (ffloor VR128X:$src)),
> (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x9))>;
> +def : Pat<(v4f32 (vselect VK4WM:$mask, (ffloor VR128X:$src), VR128X:$dst)),
> + (VRNDSCALEPSZ128rrik VR128X:$dst, VK4WM:$mask, VR128X:$src, (i32 0x9))>;
> +def : Pat<(v4f32 (vselect VK4WM:$mask, (ffloor VR128X:$src), v4f32x_info.ImmAllZerosV)),
> + (VRNDSCALEPSZ128rrikz VK4WM:$mask, VR128X:$src, (i32 0x9))>;
> def : Pat<(v4f32 (fnearbyint VR128X:$src)),
> (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xC))>;
> def : Pat<(v4f32 (fceil VR128X:$src)),
> (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xA))>;
> +def : Pat<(v4f32 (vselect VK4WM:$mask, (fceil VR128X:$src), VR128X:$dst)),
> + (VRNDSCALEPSZ128rrik VR128X:$dst, VK4WM:$mask, VR128X:$src, (i32 0xA))>;
> +def : Pat<(v4f32 (vselect VK4WM:$mask, (fceil VR128X:$src), v4f32x_info.ImmAllZerosV)),
> + (VRNDSCALEPSZ128rrikz VK4WM:$mask, VR128X:$src, (i32 0xA))>;
> def : Pat<(v4f32 (frint VR128X:$src)),
> (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x4))>;
> def : Pat<(v4f32 (ftrunc VR128X:$src)),
> @@ -10004,10 +10062,18 @@ def : Pat<(v4f32 (ftrunc (loadv4f32 addr
>
> def : Pat<(v2f64 (ffloor VR128X:$src)),
> (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x9))>;
> +def : Pat<(v2f64 (vselect VK2WM:$mask, (ffloor VR128X:$src), VR128X:$dst)),
> + (VRNDSCALEPDZ128rrik VR128X:$dst, VK2WM:$mask, VR128X:$src, (i32 0x9))>;
> +def : Pat<(v2f64 (vselect VK2WM:$mask, (ffloor VR128X:$src), v2f64x_info.ImmAllZerosV)),
> + (VRNDSCALEPDZ128rrikz VK2WM:$mask, VR128X:$src, (i32 0x9))>;
> def : Pat<(v2f64 (fnearbyint VR128X:$src)),
> (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xC))>;
> def : Pat<(v2f64 (fceil VR128X:$src)),
> (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xA))>;
> +def : Pat<(v2f64 (vselect VK2WM:$mask, (fceil VR128X:$src), VR128X:$dst)),
> + (VRNDSCALEPDZ128rrik VR128X:$dst, VK2WM:$mask, VR128X:$src, (i32 0xA))>;
> +def : Pat<(v2f64 (vselect VK2WM:$mask, (fceil VR128X:$src), v2f64x_info.ImmAllZerosV)),
> + (VRNDSCALEPDZ128rrikz VK2WM:$mask, VR128X:$src, (i32 0xA))>;
> def : Pat<(v2f64 (frint VR128X:$src)),
> (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x4))>;
> def : Pat<(v2f64 (ftrunc VR128X:$src)),
> @@ -10026,10 +10092,18 @@ def : Pat<(v2f64 (ftrunc (loadv2f64 addr
>
> def : Pat<(v8f32 (ffloor VR256X:$src)),
> (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x9))>;
> +def : Pat<(v8f32 (vselect VK8WM:$mask, (ffloor VR256X:$src), VR256X:$dst)),
> + (VRNDSCALEPSZ256rrik VR256X:$dst, VK8WM:$mask, VR256X:$src, (i32 0x9))>;
> +def : Pat<(v8f32 (vselect VK8WM:$mask, (ffloor VR256X:$src), v8f32x_info.ImmAllZerosV)),
> + (VRNDSCALEPSZ256rrikz VK8WM:$mask, VR256X:$src, (i32 0x9))>;
> def : Pat<(v8f32 (fnearbyint VR256X:$src)),
> (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xC))>;
> def : Pat<(v8f32 (fceil VR256X:$src)),
> (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xA))>;
> +def : Pat<(v8f32 (vselect VK8WM:$mask, (fceil VR256X:$src), VR256X:$dst)),
> + (VRNDSCALEPSZ256rrik VR256X:$dst, VK8WM:$mask, VR256X:$src, (i32 0xA))>;
> +def : Pat<(v8f32 (vselect VK8WM:$mask, (fceil VR256X:$src), v8f32x_info.ImmAllZerosV)),
> + (VRNDSCALEPSZ256rrikz VK8WM:$mask, VR256X:$src, (i32 0xA))>;
> def : Pat<(v8f32 (frint VR256X:$src)),
> (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x4))>;
> def : Pat<(v8f32 (ftrunc VR256X:$src)),
> @@ -10048,10 +10122,18 @@ def : Pat<(v8f32 (ftrunc (loadv8f32 addr
>
> def : Pat<(v4f64 (ffloor VR256X:$src)),
> (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x9))>;
> +def : Pat<(v4f64 (vselect VK4WM:$mask, (ffloor VR256X:$src), VR256X:$dst)),
> + (VRNDSCALEPDZ256rrik VR256X:$dst, VK4WM:$mask, VR256X:$src, (i32 0x9))>;
> +def : Pat<(v4f64 (vselect VK4WM:$mask, (ffloor VR256X:$src), v4f64x_info.ImmAllZerosV)),
> + (VRNDSCALEPDZ256rrikz VK4WM:$mask, VR256X:$src, (i32 0x9))>;
> def : Pat<(v4f64 (fnearbyint VR256X:$src)),
> (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xC))>;
> def : Pat<(v4f64 (fceil VR256X:$src)),
> (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xA))>;
> +def : Pat<(v4f64 (vselect VK4WM:$mask, (fceil VR256X:$src), VR256X:$dst)),
> + (VRNDSCALEPDZ256rrik VR256X:$dst, VK4WM:$mask, VR256X:$src, (i32 0xA))>;
> +def : Pat<(v4f64 (vselect VK4WM:$mask, (fceil VR256X:$src), v4f64x_info.ImmAllZerosV)),
> + (VRNDSCALEPDZ256rrikz VK4WM:$mask, VR256X:$src, (i32 0xA))>;
> def : Pat<(v4f64 (frint VR256X:$src)),
> (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x4))>;
> def : Pat<(v4f64 (ftrunc VR256X:$src)),
>
> Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=335037&r1=335036&r2=335037&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
> +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Tue Jun 19 03:37:52 2018
> @@ -5944,6 +5944,15 @@ let Predicates = [UseSSE41] in {
> (ROUNDPDm addr:$src, (i32 0xB))>;
> }
>
> +defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss,
> + v4f32, 0x01, UseSSE41>;
> +defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss,
> + v4f32, 0x02, UseSSE41>;
> +defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd,
> + v2f64, 0x01, UseSSE41>;
> +defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd,
> + v2f64, 0x02, UseSSE41>;
> +
> //===----------------------------------------------------------------------===//
> // SSE4.1 - Packed Bit Test
> //===----------------------------------------------------------------------===//
>
> Modified: llvm/trunk/test/CodeGen/X86/vec_floor.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_floor.ll?rev=335037&r1=335036&r2=335037&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/vec_floor.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/vec_floor.ll Tue Jun 19 03:37:52 2018
> @@ -1,7 +1,8 @@
> ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
> ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
> ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX
> -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512
> +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
> +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
>
> define <2 x double> @floor_v2f64(<2 x double> %p) {
> ; SSE41-LABEL: floor_v2f64:
> @@ -770,3 +771,2135 @@ define <4 x float> @const_trunc_v4f32()
> %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
> ret <4 x float> %t
> }
> +
> +;
> +; Scalar and masked instructions
> +;
> +
> +define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
> +; SSE41-LABEL: floor_ss:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundss $1, %xmm0, %xmm1
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_ss:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vroundss $1, %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_ss:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: vroundss $1, %xmm0, %xmm1, %xmm0
> +; AVX512-NEXT: retq
> + %s = extractelement <4 x float> %x, i32 0
> + %call = call float @llvm.floor.f32(float %s)
> + %res = insertelement <4 x float> %y, float %call, i32 0
> + ret <4 x float> %res
> +}
> +declare float @llvm.floor.f32(float %s)
> +
> +define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
> +; SSE41-LABEL: floor_sd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundsd $1, %xmm0, %xmm1
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_sd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vroundsd $1, %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_sd:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: vroundsd $1, %xmm0, %xmm1, %xmm0
> +; AVX512-NEXT: retq
> + %s = extractelement <2 x double> %x, i32 0
> + %call = call double @llvm.floor.f64(double %s)
> + %res = insertelement <2 x double> %y, double %call, i32 0
> + ret <2 x double> %res
> +}
> +declare double @llvm.floor.f64(double %s)
> +
> +define <4 x float> @floor_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
> +; SSE41-LABEL: floor_mask_128_ps:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundps $9, %xmm0, %xmm2
> +; SSE41-NEXT: cmpeqps %xmm1, %xmm0
> +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_128_ps:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2
> +; AVX-NEXT: vroundps $9, %xmm0, %xmm0
> +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_mask_128_ps:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundps $9, %xmm0, %xmm0
> +; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_mask_128_ps:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscaleps $9, %xmm0, %xmm1 {%k1}
> +; AVX512VL-NEXT: vmovaps %xmm1, %xmm0
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <4 x float> %x, %y
> + %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
> + %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y
> + ret <4 x float> %res
> +}
> +
> +define <4 x float> @floor_maskz_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
> +; SSE41-LABEL: floor_maskz_128_ps:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: cmpeqps %xmm0, %xmm1
> +; SSE41-NEXT: roundps $9, %xmm0, %xmm0
> +; SSE41-NEXT: andps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_128_ps:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1
> +; AVX-NEXT: vroundps $9, %xmm0, %xmm0
> +; AVX-NEXT: vandps %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_maskz_128_ps:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundps $9, %xmm0, %xmm0
> +; AVX512F-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_maskz_128_ps:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscaleps $9, %xmm0, %xmm0 {%k1} {z}
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <4 x float> %x, %y
> + %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
> + %res = select <4 x i1> %k, <4 x float> %call, <4 x float> zeroinitializer
> + ret <4 x float> %res
> +}
> +
> +define <2 x double> @floor_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
> +; SSE41-LABEL: floor_mask_128_pd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundpd $9, %xmm0, %xmm2
> +; SSE41-NEXT: cmpeqpd %xmm1, %xmm0
> +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_128_pd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2
> +; AVX-NEXT: vroundpd $9, %xmm0, %xmm0
> +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_mask_128_pd:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundpd $9, %xmm0, %xmm0
> +; AVX512F-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_mask_128_pd:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscalepd $9, %xmm0, %xmm1 {%k1}
> +; AVX512VL-NEXT: vmovapd %xmm1, %xmm0
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <2 x double> %x, %y
> + %call = call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
> + %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y
> + ret <2 x double> %res
> +}
> +
> +define <2 x double> @floor_maskz_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
> +; SSE41-LABEL: floor_maskz_128_pd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: cmpeqpd %xmm0, %xmm1
> +; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
> +; SSE41-NEXT: andpd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_128_pd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1
> +; AVX-NEXT: vroundpd $9, %xmm0, %xmm0
> +; AVX-NEXT: vandpd %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_maskz_128_pd:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundpd $9, %xmm0, %xmm0
> +; AVX512F-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_maskz_128_pd:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscalepd $9, %xmm0, %xmm0 {%k1} {z}
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <2 x double> %x, %y
> + %call = call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
> + %res = select <2 x i1> %k, <2 x double> %call, <2 x double> zeroinitializer
> + ret <2 x double> %res
> +}
> +
> +define <8 x float> @floor_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
> +; SSE41-LABEL: floor_mask_256_ps:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundps $9, %xmm1, %xmm4
> +; SSE41-NEXT: cmpeqps %xmm3, %xmm1
> +; SSE41-NEXT: roundps $9, %xmm0, %xmm5
> +; SSE41-NEXT: cmpeqps %xmm2, %xmm0
> +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3
> +; SSE41-NEXT: movaps %xmm2, %xmm0
> +; SSE41-NEXT: movaps %xmm3, %xmm1
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_256_ps:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm2
> +; AVX-NEXT: vroundps $9, %ymm0, %ymm0
> +; AVX-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_mask_256_ps:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundps $9, %ymm0, %ymm0
> +; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_mask_256_ps:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %k1
> +; AVX512VL-NEXT: vrndscaleps $9, %ymm0, %ymm1 {%k1}
> +; AVX512VL-NEXT: vmovaps %ymm1, %ymm0
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <8 x float> %x, %y
> + %call = call <8 x float> @llvm.floor.v8f32(<8 x float> %x)
> + %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y
> + ret <8 x float> %res
> +}
> +
> +define <8 x float> @floor_maskz_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
> +; SSE41-LABEL: floor_maskz_256_ps:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: cmpeqps %xmm1, %xmm3
> +; SSE41-NEXT: cmpeqps %xmm0, %xmm2
> +; SSE41-NEXT: roundps $9, %xmm1, %xmm1
> +; SSE41-NEXT: andps %xmm3, %xmm1
> +; SSE41-NEXT: roundps $9, %xmm0, %xmm0
> +; SSE41-NEXT: andps %xmm2, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_256_ps:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
> +; AVX-NEXT: vroundps $9, %ymm0, %ymm0
> +; AVX-NEXT: vandps %ymm0, %ymm1, %ymm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_maskz_256_ps:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundps $9, %ymm0, %ymm0
> +; AVX512F-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_maskz_256_ps:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %k1
> +; AVX512VL-NEXT: vrndscaleps $9, %ymm0, %ymm0 {%k1} {z}
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <8 x float> %x, %y
> + %call = call <8 x float> @llvm.floor.v8f32(<8 x float> %x)
> + %res = select <8 x i1> %k, <8 x float> %call, <8 x float> zeroinitializer
> + ret <8 x float> %res
> +}
> +
> +define <4 x double> @floor_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
> +; SSE41-LABEL: floor_mask_256_pd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundpd $9, %xmm1, %xmm4
> +; SSE41-NEXT: cmpeqpd %xmm3, %xmm1
> +; SSE41-NEXT: roundpd $9, %xmm0, %xmm5
> +; SSE41-NEXT: cmpeqpd %xmm2, %xmm0
> +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3
> +; SSE41-NEXT: movapd %xmm2, %xmm0
> +; SSE41-NEXT: movapd %xmm3, %xmm1
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_256_pd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
> +; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
> +; AVX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_mask_256_pd:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundpd $9, %ymm0, %ymm0
> +; AVX512F-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_mask_256_pd:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
> +; AVX512VL-NEXT: vrndscalepd $9, %ymm0, %ymm1 {%k1}
> +; AVX512VL-NEXT: vmovapd %ymm1, %ymm0
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <4 x double> %x, %y
> + %call = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
> + %res = select <4 x i1> %k, <4 x double> %call, <4 x double> %y
> + ret <4 x double> %res
> +}
> +
> +define <4 x double> @floor_maskz_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
> +; SSE41-LABEL: floor_maskz_256_pd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: cmpeqpd %xmm1, %xmm3
> +; SSE41-NEXT: cmpeqpd %xmm0, %xmm2
> +; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
> +; SSE41-NEXT: andpd %xmm3, %xmm1
> +; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
> +; SSE41-NEXT: andpd %xmm2, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_256_pd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1
> +; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
> +; AVX-NEXT: vandpd %ymm0, %ymm1, %ymm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_maskz_256_pd:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundpd $9, %ymm0, %ymm0
> +; AVX512F-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_maskz_256_pd:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
> +; AVX512VL-NEXT: vrndscalepd $9, %ymm0, %ymm0 {%k1} {z}
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <4 x double> %x, %y
> + %call = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
> + %res = select <4 x i1> %k, <4 x double> %call, <4 x double> zeroinitializer
> + ret <4 x double> %res
> +}
> +
> +define <16 x float> @floor_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
> +; SSE41-LABEL: floor_mask_512_ps:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundps $9, %xmm3, %xmm8
> +; SSE41-NEXT: cmpeqps %xmm7, %xmm3
> +; SSE41-NEXT: roundps $9, %xmm2, %xmm9
> +; SSE41-NEXT: cmpeqps %xmm6, %xmm2
> +; SSE41-NEXT: roundps $9, %xmm1, %xmm10
> +; SSE41-NEXT: cmpeqps %xmm5, %xmm1
> +; SSE41-NEXT: roundps $9, %xmm0, %xmm11
> +; SSE41-NEXT: cmpeqps %xmm4, %xmm0
> +; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm4
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5
> +; SSE41-NEXT: movaps %xmm2, %xmm0
> +; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm6
> +; SSE41-NEXT: movaps %xmm3, %xmm0
> +; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7
> +; SSE41-NEXT: movaps %xmm4, %xmm0
> +; SSE41-NEXT: movaps %xmm5, %xmm1
> +; SSE41-NEXT: movaps %xmm6, %xmm2
> +; SSE41-NEXT: movaps %xmm7, %xmm3
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_512_ps:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm4
> +; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm5
> +; AVX-NEXT: vroundps $9, %ymm1, %ymm1
> +; AVX-NEXT: vroundps $9, %ymm0, %ymm0
> +; AVX-NEXT: vblendvps %ymm5, %ymm0, %ymm2, %ymm0
> +; AVX-NEXT: vblendvps %ymm4, %ymm1, %ymm3, %ymm1
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_mask_512_ps:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm1 {%k1}
> +; AVX512-NEXT: vmovaps %zmm1, %zmm0
> +; AVX512-NEXT: retq
> + %k = fcmp oeq <16 x float> %x, %y
> + %call = call <16 x float> @llvm.floor.v16f32(<16 x float> %x)
> + %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y
> + ret <16 x float> %res
> +}
> +
> +define <16 x float> @floor_maskz_512_ps(<16 x float> %x, <16 x float> %y) nounwind {
> +; SSE41-LABEL: floor_maskz_512_ps:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: cmpeqps %xmm3, %xmm7
> +; SSE41-NEXT: cmpeqps %xmm2, %xmm6
> +; SSE41-NEXT: cmpeqps %xmm1, %xmm5
> +; SSE41-NEXT: cmpeqps %xmm0, %xmm4
> +; SSE41-NEXT: roundps $9, %xmm3, %xmm3
> +; SSE41-NEXT: andps %xmm7, %xmm3
> +; SSE41-NEXT: roundps $9, %xmm2, %xmm2
> +; SSE41-NEXT: andps %xmm6, %xmm2
> +; SSE41-NEXT: roundps $9, %xmm1, %xmm1
> +; SSE41-NEXT: andps %xmm5, %xmm1
> +; SSE41-NEXT: roundps $9, %xmm0, %xmm0
> +; SSE41-NEXT: andps %xmm4, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_512_ps:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm3
> +; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm2
> +; AVX-NEXT: vroundps $9, %ymm1, %ymm1
> +; AVX-NEXT: vandps %ymm1, %ymm3, %ymm1
> +; AVX-NEXT: vroundps $9, %ymm0, %ymm0
> +; AVX-NEXT: vandps %ymm0, %ymm2, %ymm0
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_maskz_512_ps:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm0 {%k1} {z}
> +; AVX512-NEXT: retq
> + %k = fcmp oeq <16 x float> %x, %y
> + %call = call <16 x float> @llvm.floor.v16f32(<16 x float> %x)
> + %res = select <16 x i1> %k, <16 x float> %call, <16 x float> zeroinitializer
> + ret <16 x float> %res
> +}
> +
> +define <8 x double> @floor_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
> +; SSE41-LABEL: floor_mask_512_pd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundpd $9, %xmm3, %xmm8
> +; SSE41-NEXT: cmpeqpd %xmm7, %xmm3
> +; SSE41-NEXT: roundpd $9, %xmm2, %xmm9
> +; SSE41-NEXT: cmpeqpd %xmm6, %xmm2
> +; SSE41-NEXT: roundpd $9, %xmm1, %xmm10
> +; SSE41-NEXT: cmpeqpd %xmm5, %xmm1
> +; SSE41-NEXT: roundpd $9, %xmm0, %xmm11
> +; SSE41-NEXT: cmpeqpd %xmm4, %xmm0
> +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5
> +; SSE41-NEXT: movapd %xmm2, %xmm0
> +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
> +; SSE41-NEXT: movapd %xmm3, %xmm0
> +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
> +; SSE41-NEXT: movapd %xmm4, %xmm0
> +; SSE41-NEXT: movapd %xmm5, %xmm1
> +; SSE41-NEXT: movapd %xmm6, %xmm2
> +; SSE41-NEXT: movapd %xmm7, %xmm3
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_512_pd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm4
> +; AVX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm5
> +; AVX-NEXT: vroundpd $9, %ymm1, %ymm1
> +; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
> +; AVX-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
> +; AVX-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_mask_512_pd:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
> +; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm1 {%k1}
> +; AVX512-NEXT: vmovapd %zmm1, %zmm0
> +; AVX512-NEXT: retq
> + %k = fcmp oeq <8 x double> %x, %y
> + %call = call <8 x double> @llvm.floor.v8f64(<8 x double> %x)
> + %res = select <8 x i1> %k, <8 x double> %call, <8 x double> %y
> + ret <8 x double> %res
> +}
> +
> +define <8 x double> @floor_maskz_512_pd(<8 x double> %x, <8 x double> %y) nounwind {
> +; SSE41-LABEL: floor_maskz_512_pd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: cmpeqpd %xmm3, %xmm7
> +; SSE41-NEXT: cmpeqpd %xmm2, %xmm6
> +; SSE41-NEXT: cmpeqpd %xmm1, %xmm5
> +; SSE41-NEXT: cmpeqpd %xmm0, %xmm4
> +; SSE41-NEXT: roundpd $9, %xmm3, %xmm3
> +; SSE41-NEXT: andpd %xmm7, %xmm3
> +; SSE41-NEXT: roundpd $9, %xmm2, %xmm2
> +; SSE41-NEXT: andpd %xmm6, %xmm2
> +; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
> +; SSE41-NEXT: andpd %xmm5, %xmm1
> +; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
> +; SSE41-NEXT: andpd %xmm4, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_512_pd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm3
> +; AVX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm2
> +; AVX-NEXT: vroundpd $9, %ymm1, %ymm1
> +; AVX-NEXT: vandpd %ymm1, %ymm3, %ymm1
> +; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
> +; AVX-NEXT: vandpd %ymm0, %ymm2, %ymm0
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_maskz_512_pd:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
> +; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm0 {%k1} {z}
> +; AVX512-NEXT: retq
> + %k = fcmp oeq <8 x double> %x, %y
> + %call = call <8 x double> @llvm.floor.v8f64(<8 x double> %x)
> + %res = select <8 x i1> %k, <8 x double> %call, <8 x double> zeroinitializer
> + ret <8 x double> %res
> +}
> +
> +define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, i8 %k) nounwind {
> +; SSE41-LABEL: floor_mask_ss:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: testb $1, %dil
> +; SSE41-NEXT: je LBB50_2
> +; SSE41-NEXT: ## %bb.1:
> +; SSE41-NEXT: xorps %xmm2, %xmm2
> +; SSE41-NEXT: roundss $9, %xmm0, %xmm2
> +; SSE41-NEXT: LBB50_2:
> +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_ss:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: testb $1, %dil
> +; AVX-NEXT: je LBB50_2
> +; AVX-NEXT: ## %bb.1:
> +; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
> +; AVX-NEXT: LBB50_2:
> +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_mask_ss:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: kmovw %edi, %k1
> +; AVX512-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1}
> +; AVX512-NEXT: vmovaps %xmm2, %xmm0
> +; AVX512-NEXT: retq
> + %mask = and i8 %k, 1
> + %nmask = icmp eq i8 %mask, 0
> + %s = extractelement <4 x float> %x, i64 0
> + %call = tail call float @llvm.floor.f32(float %s)
> + %dst = extractelement <4 x float> %w, i64 0
> + %low = select i1 %nmask, float %dst, float %call
> + %res = insertelement <4 x float> %y, float %low, i64 0
> + ret <4 x float> %res
> +}
> +
> +define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwind {
> +; SSE41-LABEL: floor_maskz_ss:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: testb $1, %dil
> +; SSE41-NEXT: xorps %xmm2, %xmm2
> +; SSE41-NEXT: je LBB51_2
> +; SSE41-NEXT: ## %bb.1:
> +; SSE41-NEXT: xorps %xmm2, %xmm2
> +; SSE41-NEXT: roundss $9, %xmm0, %xmm2
> +; SSE41-NEXT: LBB51_2:
> +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_ss:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: testb $1, %dil
> +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
> +; AVX-NEXT: je LBB51_2
> +; AVX-NEXT: ## %bb.1:
> +; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
> +; AVX-NEXT: LBB51_2:
> +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_maskz_ss:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: kmovw %edi, %k1
> +; AVX512-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
> +; AVX512-NEXT: retq
> + %mask = and i8 %k, 1
> + %nmask = icmp eq i8 %mask, 0
> + %s = extractelement <4 x float> %x, i64 0
> + %call = tail call float @llvm.floor.f32(float %s)
> + %low = select i1 %nmask, float zeroinitializer, float %call
> + %res = insertelement <4 x float> %y, float %low, i64 0
> + ret <4 x float> %res
> +}
> +
> +define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> %w, i8 %k) nounwind {
> +; SSE41-LABEL: floor_mask_sd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: testb $1, %dil
> +; SSE41-NEXT: je LBB52_2
> +; SSE41-NEXT: ## %bb.1:
> +; SSE41-NEXT: xorps %xmm2, %xmm2
> +; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
> +; SSE41-NEXT: LBB52_2:
> +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_sd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: testb $1, %dil
> +; AVX-NEXT: je LBB52_2
> +; AVX-NEXT: ## %bb.1:
> +; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
> +; AVX-NEXT: LBB52_2:
> +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_mask_sd:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: kmovw %edi, %k1
> +; AVX512-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1}
> +; AVX512-NEXT: vmovapd %xmm2, %xmm0
> +; AVX512-NEXT: retq
> + %mask = and i8 %k, 1
> + %nmask = icmp eq i8 %mask, 0
> + %s = extractelement <2 x double> %x, i64 0
> + %call = tail call double @llvm.floor.f64(double %s)
> + %dst = extractelement <2 x double> %w, i64 0
> + %low = select i1 %nmask, double %dst, double %call
> + %res = insertelement <2 x double> %y, double %low, i64 0
> + ret <2 x double> %res
> +}
> +
> +define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nounwind {
> +; SSE41-LABEL: floor_maskz_sd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: testb $1, %dil
> +; SSE41-NEXT: xorpd %xmm2, %xmm2
> +; SSE41-NEXT: je LBB53_2
> +; SSE41-NEXT: ## %bb.1:
> +; SSE41-NEXT: xorps %xmm2, %xmm2
> +; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
> +; SSE41-NEXT: LBB53_2:
> +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_sd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: testb $1, %dil
> +; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
> +; AVX-NEXT: je LBB53_2
> +; AVX-NEXT: ## %bb.1:
> +; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
> +; AVX-NEXT: LBB53_2:
> +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_maskz_sd:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: kmovw %edi, %k1
> +; AVX512-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
> +; AVX512-NEXT: retq
> + %mask = and i8 %k, 1
> + %nmask = icmp eq i8 %mask, 0
> + %s = extractelement <2 x double> %x, i64 0
> + %call = tail call double @llvm.floor.f64(double %s)
> + %low = select i1 %nmask, double zeroinitializer, double %call
> + %res = insertelement <2 x double> %y, double %low, i64 0
> + ret <2 x double> %res
> +}
> +
> +define <4 x float> @floor_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind {
> +; SSE41-LABEL: floor_mask_ss_trunc:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: testb $1, %dil
> +; SSE41-NEXT: je LBB54_2
> +; SSE41-NEXT: ## %bb.1:
> +; SSE41-NEXT: xorps %xmm2, %xmm2
> +; SSE41-NEXT: roundss $9, %xmm0, %xmm2
> +; SSE41-NEXT: LBB54_2:
> +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_ss_trunc:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: testb $1, %dil
> +; AVX-NEXT: je LBB54_2
> +; AVX-NEXT: ## %bb.1:
> +; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
> +; AVX-NEXT: LBB54_2:
> +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_mask_ss_trunc:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: kmovw %edi, %k1
> +; AVX512-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1}
> +; AVX512-NEXT: vmovaps %xmm2, %xmm0
> +; AVX512-NEXT: retq
> + %mask = trunc i16 %k to i1
> + %s = extractelement <4 x float> %x, i64 0
> + %call = tail call float @llvm.floor.f32(float %s)
> + %dst = extractelement <4 x float> %w, i64 0
> + %low = select i1 %mask, float %call, float %dst
> + %res = insertelement <4 x float> %y, float %low, i64 0
> + ret <4 x float> %res
> +}
> +
> +define <4 x float> @floor_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k) nounwind {
> +; SSE41-LABEL: floor_maskz_ss_trunc:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: testb $1, %dil
> +; SSE41-NEXT: jne LBB55_1
> +; SSE41-NEXT: ## %bb.2:
> +; SSE41-NEXT: xorps %xmm0, %xmm0
> +; SSE41-NEXT: jmp LBB55_3
> +; SSE41-NEXT: LBB55_1:
> +; SSE41-NEXT: roundss $9, %xmm0, %xmm0
> +; SSE41-NEXT: LBB55_3:
> +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_ss_trunc:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: testb $1, %dil
> +; AVX-NEXT: jne LBB55_1
> +; AVX-NEXT: ## %bb.2:
> +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
> +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
> +; AVX-NEXT: retq
> +; AVX-NEXT: LBB55_1:
> +; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
> +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_maskz_ss_trunc:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: kmovw %edi, %k1
> +; AVX512-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
> +; AVX512-NEXT: retq
> + %mask = trunc i16 %k to i1
> + %s = extractelement <4 x float> %x, i64 0
> + %call = tail call float @llvm.floor.f32(float %s)
> + %low = select i1 %mask, float %call, float zeroinitializer
> + %res = insertelement <4 x float> %y, float %low, i64 0
> + ret <4 x float> %res
> +}
> +
> +define <2 x double> @floor_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x double> %w, i16 %k) nounwind {
> +; SSE41-LABEL: floor_mask_sd_trunc:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: testb $1, %dil
> +; SSE41-NEXT: je LBB56_2
> +; SSE41-NEXT: ## %bb.1:
> +; SSE41-NEXT: xorps %xmm2, %xmm2
> +; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
> +; SSE41-NEXT: LBB56_2:
> +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_sd_trunc:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: testb $1, %dil
> +; AVX-NEXT: je LBB56_2
> +; AVX-NEXT: ## %bb.1:
> +; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
> +; AVX-NEXT: LBB56_2:
> +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_mask_sd_trunc:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: kmovw %edi, %k1
> +; AVX512-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1}
> +; AVX512-NEXT: vmovapd %xmm2, %xmm0
> +; AVX512-NEXT: retq
> + %mask = trunc i16 %k to i1
> + %s = extractelement <2 x double> %x, i64 0
> + %call = tail call double @llvm.floor.f64(double %s)
> + %dst = extractelement <2 x double> %w, i64 0
> + %low = select i1 %mask, double %call, double %dst
> + %res = insertelement <2 x double> %y, double %low, i64 0
> + ret <2 x double> %res
> +}
> +
> +define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %k) nounwind {
> +; SSE41-LABEL: floor_maskz_sd_trunc:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: testb $1, %dil
> +; SSE41-NEXT: jne LBB57_1
> +; SSE41-NEXT: ## %bb.2:
> +; SSE41-NEXT: xorpd %xmm0, %xmm0
> +; SSE41-NEXT: jmp LBB57_3
> +; SSE41-NEXT: LBB57_1:
> +; SSE41-NEXT: roundsd $9, %xmm0, %xmm0
> +; SSE41-NEXT: LBB57_3:
> +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_sd_trunc:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: testb $1, %dil
> +; AVX-NEXT: jne LBB57_1
> +; AVX-NEXT: ## %bb.2:
> +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
> +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
> +; AVX-NEXT: retq
> +; AVX-NEXT: LBB57_1:
> +; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
> +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: floor_maskz_sd_trunc:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: kmovw %edi, %k1
> +; AVX512-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
> +; AVX512-NEXT: retq
> + %mask = trunc i16 %k to i1
> + %s = extractelement <2 x double> %x, i64 0
> + %call = tail call double @llvm.floor.f64(double %s)
> + %low = select i1 %mask, double %call, double zeroinitializer
> + %res = insertelement <2 x double> %y, double %low, i64 0
> + ret <2 x double> %res
> +}
> +
> +define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind {
> +; SSE41-LABEL: floor_mask_ss_mask8:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: movaps %xmm0, %xmm3
> +; SSE41-NEXT: cmpeqps %xmm1, %xmm3
> +; SSE41-NEXT: pextrb $0, %xmm3, %eax
> +; SSE41-NEXT: testb $1, %al
> +; SSE41-NEXT: je LBB58_2
> +; SSE41-NEXT: ## %bb.1:
> +; SSE41-NEXT: xorps %xmm2, %xmm2
> +; SSE41-NEXT: roundss $9, %xmm0, %xmm2
> +; SSE41-NEXT: LBB58_2:
> +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_ss_mask8:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm3
> +; AVX-NEXT: vpextrb $0, %xmm3, %eax
> +; AVX-NEXT: testb $1, %al
> +; AVX-NEXT: je LBB58_2
> +; AVX-NEXT: ## %bb.1:
> +; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm2
> +; AVX-NEXT: LBB58_2:
> +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_mask_ss_mask8:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1}
> +; AVX512F-NEXT: vmovaps %xmm2, %xmm0
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_mask_ss_mask8:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm2 {%k1}
> +; AVX512VL-NEXT: vmovaps %xmm2, %xmm0
> +; AVX512VL-NEXT: retq
> + %mask1 = fcmp oeq <4 x float> %x, %y
> + %mask = extractelement <4 x i1> %mask1, i64 0
> + %s = extractelement <4 x float> %x, i64 0
> + %call = tail call float @llvm.floor.f32(float %s)
> + %dst = extractelement <4 x float> %w, i64 0
> + %low = select i1 %mask, float %call, float %dst
> + %res = insertelement <4 x float> %y, float %low, i64 0
> + ret <4 x float> %res
> +}
> +
> +define <4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind {
> +; SSE41-LABEL: floor_maskz_ss_mask8:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: movaps %xmm0, %xmm2
> +; SSE41-NEXT: cmpeqps %xmm1, %xmm2
> +; SSE41-NEXT: pextrb $0, %xmm2, %eax
> +; SSE41-NEXT: testb $1, %al
> +; SSE41-NEXT: jne LBB59_1
> +; SSE41-NEXT: ## %bb.2:
> +; SSE41-NEXT: xorps %xmm0, %xmm0
> +; SSE41-NEXT: jmp LBB59_3
> +; SSE41-NEXT: LBB59_1:
> +; SSE41-NEXT: roundss $9, %xmm0, %xmm0
> +; SSE41-NEXT: LBB59_3:
> +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_ss_mask8:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2
> +; AVX-NEXT: vpextrb $0, %xmm2, %eax
> +; AVX-NEXT: testb $1, %al
> +; AVX-NEXT: jne LBB59_1
> +; AVX-NEXT: ## %bb.2:
> +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
> +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
> +; AVX-NEXT: retq
> +; AVX-NEXT: LBB59_1:
> +; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
> +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_maskz_ss_mask8:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_maskz_ss_mask8:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscaless $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
> +; AVX512VL-NEXT: retq
> + %mask1 = fcmp oeq <4 x float> %x, %y
> + %mask = extractelement <4 x i1> %mask1, i64 0
> + %s = extractelement <4 x float> %x, i64 0
> + %call = tail call float @llvm.floor.f32(float %s)
> + %low = select i1 %mask, float %call, float zeroinitializer
> + %res = insertelement <4 x float> %y, float %low, i64 0
> + ret <4 x float> %res
> +}
> +
> +define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x double> %w) nounwind {
> +; SSE41-LABEL: floor_mask_sd_mask8:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: movapd %xmm0, %xmm3
> +; SSE41-NEXT: cmpeqpd %xmm1, %xmm3
> +; SSE41-NEXT: pextrb $0, %xmm3, %eax
> +; SSE41-NEXT: testb $1, %al
> +; SSE41-NEXT: je LBB60_2
> +; SSE41-NEXT: ## %bb.1:
> +; SSE41-NEXT: xorps %xmm2, %xmm2
> +; SSE41-NEXT: roundsd $9, %xmm0, %xmm2
> +; SSE41-NEXT: LBB60_2:
> +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_mask_sd_mask8:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm3
> +; AVX-NEXT: vpextrb $0, %xmm3, %eax
> +; AVX-NEXT: testb $1, %al
> +; AVX-NEXT: je LBB60_2
> +; AVX-NEXT: ## %bb.1:
> +; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm2
> +; AVX-NEXT: LBB60_2:
> +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_mask_sd_mask8:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1}
> +; AVX512F-NEXT: vmovapd %xmm2, %xmm0
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_mask_sd_mask8:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm2 {%k1}
> +; AVX512VL-NEXT: vmovapd %xmm2, %xmm0
> +; AVX512VL-NEXT: retq
> + %mask1 = fcmp oeq <2 x double> %x, %y
> + %mask = extractelement <2 x i1> %mask1, i64 0
> + %s = extractelement <2 x double> %x, i64 0
> + %call = tail call double @llvm.floor.f64(double %s)
> + %dst = extractelement <2 x double> %w, i64 0
> + %low = select i1 %mask, double %call, double %dst
> + %res = insertelement <2 x double> %y, double %low, i64 0
> + ret <2 x double> %res
> +}
> +
> +define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounwind {
> +; SSE41-LABEL: floor_maskz_sd_mask8:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: movapd %xmm0, %xmm2
> +; SSE41-NEXT: cmpeqpd %xmm1, %xmm2
> +; SSE41-NEXT: pextrb $0, %xmm2, %eax
> +; SSE41-NEXT: testb $1, %al
> +; SSE41-NEXT: jne LBB61_1
> +; SSE41-NEXT: ## %bb.2:
> +; SSE41-NEXT: xorpd %xmm0, %xmm0
> +; SSE41-NEXT: jmp LBB61_3
> +; SSE41-NEXT: LBB61_1:
> +; SSE41-NEXT: roundsd $9, %xmm0, %xmm0
> +; SSE41-NEXT: LBB61_3:
> +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: floor_maskz_sd_mask8:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2
> +; AVX-NEXT: vpextrb $0, %xmm2, %eax
> +; AVX-NEXT: testb $1, %al
> +; AVX-NEXT: jne LBB61_1
> +; AVX-NEXT: ## %bb.2:
> +; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
> +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
> +; AVX-NEXT: retq
> +; AVX-NEXT: LBB61_1:
> +; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
> +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: floor_maskz_sd_mask8:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: floor_maskz_sd_mask8:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscalesd $1, %xmm0, %xmm1, %xmm0 {%k1} {z}
> +; AVX512VL-NEXT: retq
> + %mask1 = fcmp oeq <2 x double> %x, %y
> + %mask = extractelement <2 x i1> %mask1, i64 0
> + %s = extractelement <2 x double> %x, i64 0
> + %call = tail call double @llvm.floor.f64(double %s)
> + %low = select i1 %mask, double %call, double zeroinitializer
> + %res = insertelement <2 x double> %y, double %low, i64 0
> + ret <2 x double> %res
> +}
> +
> +define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
> +; SSE41-LABEL: ceil_ss:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundss $2, %xmm0, %xmm1
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: ceil_ss:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vroundss $2, %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: ceil_ss:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: vroundss $2, %xmm0, %xmm1, %xmm0
> +; AVX512-NEXT: retq
> + %s = extractelement <4 x float> %x, i32 0
> + %call = call float @llvm.ceil.f32(float %s)
> + %res = insertelement <4 x float> %y, float %call, i32 0
> + ret <4 x float> %res
> +}
> +declare float @llvm.ceil.f32(float %s)
> +
> +define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
> +; SSE41-LABEL: ceil_sd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundsd $2, %xmm0, %xmm1
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: ceil_sd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vroundsd $2, %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512-LABEL: ceil_sd:
> +; AVX512: ## %bb.0:
> +; AVX512-NEXT: vroundsd $2, %xmm0, %xmm1, %xmm0
> +; AVX512-NEXT: retq
> + %s = extractelement <2 x double> %x, i32 0
> + %call = call double @llvm.ceil.f64(double %s)
> + %res = insertelement <2 x double> %y, double %call, i32 0
> + ret <2 x double> %res
> +}
> +declare double @llvm.ceil.f64(double %s)
> +
> +define <4 x float> @ceil_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
> +; SSE41-LABEL: ceil_mask_128_ps:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundps $10, %xmm0, %xmm2
> +; SSE41-NEXT: cmpeqps %xmm1, %xmm0
> +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: ceil_mask_128_ps:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2
> +; AVX-NEXT: vroundps $10, %xmm0, %xmm0
> +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: ceil_mask_128_ps:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundps $10, %xmm0, %xmm0
> +; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: ceil_mask_128_ps:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscaleps $10, %xmm0, %xmm1 {%k1}
> +; AVX512VL-NEXT: vmovaps %xmm1, %xmm0
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <4 x float> %x, %y
> + %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
> + %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y
> + ret <4 x float> %res
> +}
> +
> +define <4 x float> @ceil_maskz_128_ps(<4 x float> %x, <4 x float> %y) nounwind {
> +; SSE41-LABEL: ceil_maskz_128_ps:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: cmpeqps %xmm0, %xmm1
> +; SSE41-NEXT: roundps $10, %xmm0, %xmm0
> +; SSE41-NEXT: andps %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: ceil_maskz_128_ps:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1
> +; AVX-NEXT: vroundps $10, %xmm0, %xmm0
> +; AVX-NEXT: vandps %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: ceil_maskz_128_ps:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundps $10, %xmm0, %xmm0
> +; AVX512F-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: ceil_maskz_128_ps:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqps %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscaleps $10, %xmm0, %xmm0 {%k1} {z}
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <4 x float> %x, %y
> + %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
> + %res = select <4 x i1> %k, <4 x float> %call, <4 x float> zeroinitializer
> + ret <4 x float> %res
> +}
> +
> +define <2 x double> @ceil_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
> +; SSE41-LABEL: ceil_mask_128_pd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundpd $10, %xmm0, %xmm2
> +; SSE41-NEXT: cmpeqpd %xmm1, %xmm0
> +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
> +; SSE41-NEXT: movapd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: ceil_mask_128_pd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2
> +; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
> +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: ceil_mask_128_pd:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundpd $10, %xmm0, %xmm0
> +; AVX512F-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: ceil_mask_128_pd:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscalepd $10, %xmm0, %xmm1 {%k1}
> +; AVX512VL-NEXT: vmovapd %xmm1, %xmm0
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <2 x double> %x, %y
> + %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x)
> + %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y
> + ret <2 x double> %res
> +}
> +
> +define <2 x double> @ceil_maskz_128_pd(<2 x double> %x, <2 x double> %y) nounwind {
> +; SSE41-LABEL: ceil_maskz_128_pd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: cmpeqpd %xmm0, %xmm1
> +; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
> +; SSE41-NEXT: andpd %xmm1, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: ceil_maskz_128_pd:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1
> +; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
> +; AVX-NEXT: vandpd %xmm0, %xmm1, %xmm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: ceil_maskz_128_pd:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundpd $10, %xmm0, %xmm0
> +; AVX512F-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
> +; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
> +; AVX512F-NEXT: vzeroupper
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: ceil_maskz_128_pd:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
> +; AVX512VL-NEXT: vrndscalepd $10, %xmm0, %xmm0 {%k1} {z}
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <2 x double> %x, %y
> + %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x)
> + %res = select <2 x i1> %k, <2 x double> %call, <2 x double> zeroinitializer
> + ret <2 x double> %res
> +}
> +
> +define <8 x float> @ceil_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
> +; SSE41-LABEL: ceil_mask_256_ps:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundps $10, %xmm1, %xmm4
> +; SSE41-NEXT: cmpeqps %xmm3, %xmm1
> +; SSE41-NEXT: roundps $10, %xmm0, %xmm5
> +; SSE41-NEXT: cmpeqps %xmm2, %xmm0
> +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2
> +; SSE41-NEXT: movaps %xmm1, %xmm0
> +; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3
> +; SSE41-NEXT: movaps %xmm2, %xmm0
> +; SSE41-NEXT: movaps %xmm3, %xmm1
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: ceil_mask_256_ps:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm2
> +; AVX-NEXT: vroundps $10, %ymm0, %ymm0
> +; AVX-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: ceil_mask_256_ps:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundps $10, %ymm0, %ymm0
> +; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: ceil_mask_256_ps:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %k1
> +; AVX512VL-NEXT: vrndscaleps $10, %ymm0, %ymm1 {%k1}
> +; AVX512VL-NEXT: vmovaps %ymm1, %ymm0
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <8 x float> %x, %y
> + %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x)
> + %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y
> + ret <8 x float> %res
> +}
> +
> +define <8 x float> @ceil_maskz_256_ps(<8 x float> %x, <8 x float> %y) nounwind {
> +; SSE41-LABEL: ceil_maskz_256_ps:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: cmpeqps %xmm1, %xmm3
> +; SSE41-NEXT: cmpeqps %xmm0, %xmm2
> +; SSE41-NEXT: roundps $10, %xmm1, %xmm1
> +; SSE41-NEXT: andps %xmm3, %xmm1
> +; SSE41-NEXT: roundps $10, %xmm0, %xmm0
> +; SSE41-NEXT: andps %xmm2, %xmm0
> +; SSE41-NEXT: retq
> +;
> +; AVX-LABEL: ceil_maskz_256_ps:
> +; AVX: ## %bb.0:
> +; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
> +; AVX-NEXT: vroundps $10, %ymm0, %ymm0
> +; AVX-NEXT: vandps %ymm0, %ymm1, %ymm0
> +; AVX-NEXT: retq
> +;
> +; AVX512F-LABEL: ceil_maskz_256_ps:
> +; AVX512F: ## %bb.0:
> +; AVX512F-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
> +; AVX512F-NEXT: vcmpeqps %zmm1, %zmm0, %k1
> +; AVX512F-NEXT: vroundps $10, %ymm0, %ymm0
> +; AVX512F-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
> +; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
> +; AVX512F-NEXT: retq
> +;
> +; AVX512VL-LABEL: ceil_maskz_256_ps:
> +; AVX512VL: ## %bb.0:
> +; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %k1
> +; AVX512VL-NEXT: vrndscaleps $10, %ymm0, %ymm0 {%k1} {z}
> +; AVX512VL-NEXT: retq
> + %k = fcmp oeq <8 x float> %x, %y
> + %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x)
> + %res = select <8 x i1> %k, <8 x float> %call, <8 x float> zeroinitializer
> + ret <8 x float> %res
> +}
> +
> +define <4 x double> @ceil_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind {
> +; SSE41-LABEL: ceil_mask_256_pd:
> +; SSE41: ## %bb.0:
> +; SSE41-NEXT: roundpd $10, %xmm1, %xmm4
> +; SSE41-NEXT: cmpeqpd %xmm3, %xmm1
> +; SSE41-NEXT: roundpd $10, %xmm0, %xmm5
> +; SSE41-NEXT: cmpeqpd %xmm2, %xmm0
> +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
> +; SSE41-NEXT: movapd %xmm1, %xmm0
--
~Craig
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180627/a31088d3/attachment-0001.html>
More information about the llvm-commits mailing list