[llvm] fd3e7e3 - [X86] Adding lowerings for vector ISD::LRINT and ISD::LLRINT (#90065)

via llvm-commits llvm-commits at lists.llvm.org
Thu May 2 18:31:31 PDT 2024


Author: Phoebe Wang
Date: 2024-05-03T09:31:27+08:00
New Revision: fd3e7e3a1e661482f46cd0347d0fa62adef30177

URL: https://github.com/llvm/llvm-project/commit/fd3e7e3a1e661482f46cd0347d0fa62adef30177
DIFF: https://github.com/llvm/llvm-project/commit/fd3e7e3a1e661482f46cd0347d0fa62adef30177.diff

LOG: [X86] Adding lowerings for vector ISD::LRINT and ISD::LLRINT (#90065)

- [V]CVTP[D,S]2DQ has supported `f64/f32` -> `i32` conversions since SSE2;
these can be mapped to `llvm.lrint.vNi32.vNf64/32`. AVX and AVX512 added
256-bit and 512-bit support, respectively;
- VCVTP[D,S]2QQ has supported `f64/f32` -> `i64` conversions since AVX512DQ;
these can be mapped to `llvm.l[l]rint.vNi64.vNf64/32`. The 128-bit and
256-bit forms (both require AVX512VL) and the 512-bit form are all supported.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86InstrAVX512.td
    llvm/lib/Target/X86/X86InstrSSE.td
    llvm/test/CodeGen/X86/vector-llrint.ll
    llvm/test/CodeGen/X86/vector-lrint.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6a5fc3c5314656..9ac3f6d4445e4f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1092,6 +1092,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
     setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
 
+    setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
+
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1431,6 +1433,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FMINIMUM,          VT, Custom);
     }
 
+    setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
+    setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
+
     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
     // even though v8i16 is a legal type.
     setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
@@ -1731,6 +1736,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   }
+  if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
+    for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+      setOperationAction(ISD::LRINT, VT, Legal);
+      setOperationAction(ISD::LLRINT, VT, Legal);
+    }
+  }
 
   // This block controls legalization for 512-bit operations with 8/16/32/64 bit
   // elements. 512-bits can be disabled based on prefer-vector-width and
@@ -1765,6 +1776,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STRICT_FMA, VT, Legal);
       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
     }
+    setOperationAction(ISD::LRINT, MVT::v16f32,
+                       Subtarget.hasDQI() ? Legal : Custom);
+    setOperationAction(ISD::LRINT, MVT::v8f64,
+                       Subtarget.hasDQI() ? Legal : Custom);
+    if (Subtarget.hasDQI())
+      setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
 
     for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
       setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
@@ -2488,6 +2505,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::FMAXNUM,
                        ISD::SUB,
                        ISD::LOAD,
+                       ISD::LRINT,
+                       ISD::LLRINT,
                        ISD::MLOAD,
                        ISD::STORE,
                        ISD::MSTORE,
@@ -21161,8 +21180,12 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
 SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
+  EVT DstVT = Op.getSimpleValueType();
   MVT SrcVT = Src.getSimpleValueType();
 
+  if (SrcVT.isVector())
+    return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
+
   if (SrcVT == MVT::f16)
     return SDValue();
 
@@ -51542,6 +51565,22 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  SDLoc DL(N);
+
+  if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
+      SrcVT != MVT::v2f32)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
+                                 DAG.getUNDEF(SrcVT)));
+}
+
 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
 /// the codegen.
 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
@@ -51888,6 +51927,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   }
 
+  // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
+  if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
+      Src.hasOneUse())
+    return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
+
   return SDValue();
 }
 
@@ -56834,6 +56878,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::UINT_TO_FP:
   case ISD::STRICT_UINT_TO_FP:
     return combineUIntToFP(N, DAG, Subtarget);
+  case ISD::LRINT:
+  case ISD::LLRINT:         return combineLRINT_LLRINT(N, DAG, Subtarget);
   case ISD::FADD:
   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
   case X86ISD::VFCMULC:

diff  --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 43a40f5e691ea3..ec2a5f52a7b6aa 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -8811,7 +8811,18 @@ let Predicates = [HasVLX] in {
   def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
                           v4i32x_info.ImmAllZerosV, VK2WM:$mask),
             (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v4i32 (lrint VR128X:$src)), (VCVTPS2DQZ128rr VR128X:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQZ128rm addr:$src)>;
+  def : Pat<(v8i32 (lrint VR256X:$src)), (VCVTPS2DQZ256rr VR256X:$src)>;
+  def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQZ256rm addr:$src)>;
+  def : Pat<(v4i32 (lrint VR256X:$src)), (VCVTPD2DQZ256rr VR256X:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQZ256rm addr:$src)>;
 }
+def : Pat<(v16i32 (lrint VR512:$src)), (VCVTPS2DQZrr VR512:$src)>;
+def : Pat<(v16i32 (lrint (loadv16f32 addr:$src))), (VCVTPS2DQZrm addr:$src)>;
+def : Pat<(v8i32 (lrint VR512:$src)), (VCVTPD2DQZrr VR512:$src)>;
+def : Pat<(v8i32 (lrint (loadv8f64 addr:$src))), (VCVTPD2DQZrm addr:$src)>;
 
 let Predicates = [HasDQI, HasVLX] in {
   def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
@@ -8857,6 +8868,30 @@ let Predicates = [HasDQI, HasVLX] in {
                                  (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
                                  v2i64x_info.ImmAllZerosV)),
             (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+  def : Pat<(v4i64 (lrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+  def : Pat<(v4i64 (lrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+  def : Pat<(v4i64 (llrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+  def : Pat<(v4i64 (llrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+  def : Pat<(v2i64 (lrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+  def : Pat<(v2i64 (lrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+  def : Pat<(v4i64 (lrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+  def : Pat<(v4i64 (lrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+  def : Pat<(v2i64 (llrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+  def : Pat<(v2i64 (llrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+  def : Pat<(v4i64 (llrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+  def : Pat<(v4i64 (llrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+}
+
+let Predicates = [HasDQI] in {
+  def : Pat<(v8i64 (lrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+  def : Pat<(v8i64 (lrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+  def : Pat<(v8i64 (llrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+  def : Pat<(v8i64 (llrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+  def : Pat<(v8i64 (lrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>;
+  def : Pat<(v8i64 (lrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>;
+  def : Pat<(v8i64 (llrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>;
+  def : Pat<(v8i64 (llrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>;
 }
 
 let Predicates = [HasVLX] in {

diff  --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 063b572761e7d1..bc15085f6c7b7c 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1554,7 +1554,6 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                      Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
 
-
 // Convert Packed Double FP to Packed DW Integers
 let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
 // The assembler can recognize rr 256-bit instructions by seeing a ymm
@@ -1586,6 +1585,20 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
 }
 
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (lrint VR128:$src)), (VCVTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQrm addr:$src)>;
+  def : Pat<(v8i32 (lrint VR256:$src)), (VCVTPS2DQYrr VR256:$src)>;
+  def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQYrm addr:$src)>;
+  def : Pat<(v4i32 (lrint VR256:$src)), (VCVTPD2DQYrr VR256:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(v4i32 (lrint VR128:$src)), (CVTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (CVTPS2DQrm addr:$src)>;
+}
+
 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",

diff  --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll
index 46904f82fd5d6d..7017eb60df41d6 100644
--- a/llvm/test/CodeGen/X86/vector-llrint.ll
+++ b/llvm/test/CodeGen/X86/vector-llrint.ll
@@ -1,289 +1,674 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefix=X64-AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX512DQ
 
 define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
-; X64-SSE-LABEL: llrint_v1i64_v1f32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v1i64_v1f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    retq
 ;
-; X64-AVX-LABEL: llrint_v1i64_v1f32:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-NEXT:    retq
+; AVX-LABEL: llrint_v1i64_v1f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtss2si %xmm0, %rax
+; AVX-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v1i64_v1f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512DQ-NEXT:    retq
   %a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
   ret <1 x i64> %a
 }
 declare <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float>)
 
 define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
-; X64-SSE-LABEL: llrint_v2i64_v2f32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm1
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v2i64_v2f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: llrint_v2i64_v2f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtss2si %xmm0, %rax
+; AVX-NEXT:    vmovq %rax, %xmm1
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    vcvtss2si %xmm0, %rax
+; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
 ;
-; X64-AVX-LABEL: llrint_v2i64_v2f32:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-NEXT:    vmovq %rax, %xmm1
-; X64-AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-AVX-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-NEXT:    vmovq %rax, %xmm0
-; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX-NEXT:    retq
+; AVX512DQ-LABEL: llrint_v2i64_v2f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtps2qq %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
   ret <2 x i64> %a
 }
 declare <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float>)
 
 define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
-; X64-SSE-LABEL: llrint_v4i64_v4f32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm2
-; X64-SSE-NEXT:    movaps %xmm0, %xmm1
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm1
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X64-SSE-NEXT:    movaps %xmm0, %xmm1
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm1
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v4i64_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm2
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE-NEXT:    cvtss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
+; SSE-NEXT:    cvtss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: llrint_v4i64_v4f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: llrint_v4i64_v4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v4i64_v4f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtps2qq %xmm0, %ymm0
+; AVX512DQ-NEXT:    retq
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
   ret <4 x i64> %a
 }
 declare <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float>)
 
 define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
-; X64-SSE-LABEL: llrint_v8i64_v8f32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movaps %xmm0, %xmm2
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    movaps %xmm2, %xmm3
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; X64-SSE-NEXT:    movaps %xmm2, %xmm3
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm2, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm4
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm2
-; X64-SSE-NEXT:    movaps %xmm1, %xmm3
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X64-SSE-NEXT:    movaps %xmm1, %xmm3
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm5
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; X64-SSE-NEXT:    movdqa %xmm4, %xmm1
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v8i64_v8f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    cvtss2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    movaps %xmm2, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
+; SSE-NEXT:    cvtss2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE-NEXT:    movaps %xmm2, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
+; SSE-NEXT:    cvtss2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    cvtss2si %xmm2, %rax
+; SSE-NEXT:    movq %rax, %xmm4
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; SSE-NEXT:    cvtss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm2
+; SSE-NEXT:    movaps %xmm1, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
+; SSE-NEXT:    cvtss2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT:    movaps %xmm1, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm1[3,3]
+; SSE-NEXT:    cvtss2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm5
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    cvtss2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE-NEXT:    movdqa %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: llrint_v8i64_v8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vmovaps %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: llrint_v8i64_v8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v8i64_v8f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %a = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> %x)
   ret <8 x i64> %a
 }
 declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>)
 
 define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
-; X64-SSE-LABEL: llrint_v16i64_v16f32:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movq %rdi, %rax
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm4
-; X64-SSE-NEXT:    movaps %xmm0, %xmm5
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm5, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm5
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; X64-SSE-NEXT:    movaps %xmm0, %xmm5
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm0[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm5, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm5
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm0, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm5
-; X64-SSE-NEXT:    movaps %xmm1, %xmm6
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm6, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm6
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; X64-SSE-NEXT:    movaps %xmm1, %xmm6
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,3],xmm1[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm6, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm6
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm1, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm1
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
-; X64-SSE-NEXT:    cvtss2si %xmm2, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm6
-; X64-SSE-NEXT:    movaps %xmm2, %xmm7
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm7, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm7
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; X64-SSE-NEXT:    movaps %xmm2, %xmm7
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,3],xmm2[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm7, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm7
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm2, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm2
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm7
-; X64-SSE-NEXT:    movaps %xmm3, %xmm8
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm8, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm8
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
-; X64-SSE-NEXT:    movaps %xmm3, %xmm8
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3]
-; X64-SSE-NEXT:    cvtss2si %xmm8, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm8
-; X64-SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
-; X64-SSE-NEXT:    cvtss2si %xmm3, %rcx
-; X64-SSE-NEXT:    movq %rcx, %xmm3
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0]
-; X64-SSE-NEXT:    movdqa %xmm3, 112(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm7, 96(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm2, 80(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm6, 64(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm1, 48(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm5, 32(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm0, 16(%rdi)
-; X64-SSE-NEXT:    movdqa %xmm4, (%rdi)
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v16i64_v16f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    cvtss2si %xmm0, %rcx
+; SSE-NEXT:    movq %rcx, %xmm4
+; SSE-NEXT:    movaps %xmm0, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1]
+; SSE-NEXT:    cvtss2si %xmm5, %rcx
+; SSE-NEXT:    movq %rcx, %xmm5
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE-NEXT:    movaps %xmm0, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,3],xmm0[3,3]
+; SSE-NEXT:    cvtss2si %xmm5, %rcx
+; SSE-NEXT:    movq %rcx, %xmm5
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvtss2si %xmm0, %rcx
+; SSE-NEXT:    movq %rcx, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE-NEXT:    cvtss2si %xmm1, %rcx
+; SSE-NEXT:    movq %rcx, %xmm5
+; SSE-NEXT:    movaps %xmm1, %xmm6
+; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[1,1],xmm1[1,1]
+; SSE-NEXT:    cvtss2si %xmm6, %rcx
+; SSE-NEXT:    movq %rcx, %xmm6
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; SSE-NEXT:    movaps %xmm1, %xmm6
+; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,3],xmm1[3,3]
+; SSE-NEXT:    cvtss2si %xmm6, %rcx
+; SSE-NEXT:    movq %rcx, %xmm6
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    cvtss2si %xmm1, %rcx
+; SSE-NEXT:    movq %rcx, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm6[0]
+; SSE-NEXT:    cvtss2si %xmm2, %rcx
+; SSE-NEXT:    movq %rcx, %xmm6
+; SSE-NEXT:    movaps %xmm2, %xmm7
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,1],xmm2[1,1]
+; SSE-NEXT:    cvtss2si %xmm7, %rcx
+; SSE-NEXT:    movq %rcx, %xmm7
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE-NEXT:    movaps %xmm2, %xmm7
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,3],xmm2[3,3]
+; SSE-NEXT:    cvtss2si %xmm7, %rcx
+; SSE-NEXT:    movq %rcx, %xmm7
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    cvtss2si %xmm2, %rcx
+; SSE-NEXT:    movq %rcx, %xmm2
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
+; SSE-NEXT:    cvtss2si %xmm3, %rcx
+; SSE-NEXT:    movq %rcx, %xmm7
+; SSE-NEXT:    movaps %xmm3, %xmm8
+; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,1],xmm3[1,1]
+; SSE-NEXT:    cvtss2si %xmm8, %rcx
+; SSE-NEXT:    movq %rcx, %xmm8
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; SSE-NEXT:    movaps %xmm3, %xmm8
+; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,3],xmm3[3,3]
+; SSE-NEXT:    cvtss2si %xmm8, %rcx
+; SSE-NEXT:    movq %rcx, %xmm8
+; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT:    cvtss2si %xmm3, %rcx
+; SSE-NEXT:    movq %rcx, %xmm3
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0]
+; SSE-NEXT:    movdqa %xmm3, 112(%rdi)
+; SSE-NEXT:    movdqa %xmm7, 96(%rdi)
+; SSE-NEXT:    movdqa %xmm2, 80(%rdi)
+; SSE-NEXT:    movdqa %xmm6, 64(%rdi)
+; SSE-NEXT:    movdqa %xmm1, 48(%rdi)
+; SSE-NEXT:    movdqa %xmm5, 32(%rdi)
+; SSE-NEXT:    movdqa %xmm0, 16(%rdi)
+; SSE-NEXT:    movdqa %xmm4, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: llrint_v16i64_v16f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps %ymm0, %ymm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm4, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm4
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm4, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm4
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm4
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm5, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm5
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm3, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX1-NEXT:    vcvtss2si %xmm5, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm5
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm5
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vcvtss2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm3
+; AVX1-NEXT:    vmovaps %ymm4, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: llrint_v16i64_v16f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm4, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm2
+; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm4, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm4
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX512-NEXT:    vcvtss2si %xmm4, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm4
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v16i64_v16f32:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtps2qq %ymm0, %zmm2
+; AVX512DQ-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT:    vcvtps2qq %ymm0, %zmm1
+; AVX512DQ-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512DQ-NEXT:    retq
   %a = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> %x)
   ret <16 x i64> %a
 }
 declare <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float>)
 
 define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
-; X64-SSE-LABEL: llrint_v1i64_v1f64:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v1i64_v1f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: llrint_v1i64_v1f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX-NEXT:    retq
 ;
-; X64-AVX-LABEL: llrint_v1i64_v1f64:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-NEXT:    retq
+; AVX512DQ-LABEL: llrint_v1i64_v1f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512DQ-NEXT:    retq
   %a = call <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double> %x)
   ret <1 x i64> %a
 }
 declare <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double>)
 
 define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
-; X64-SSE-LABEL: llrint_v2i64_v2f64:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm1
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v2i64_v2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: llrint_v2i64_v2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX-NEXT:    vmovq %rax, %xmm1
+; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
 ;
-; X64-AVX-LABEL: llrint_v2i64_v2f64:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-NEXT:    vmovq %rax, %xmm1
-; X64-AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-NEXT:    vmovq %rax, %xmm0
-; X64-AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX-NEXT:    retq
+; AVX512DQ-LABEL: llrint_v2i64_v2f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtpd2qq %xmm0, %xmm0
+; AVX512DQ-NEXT:    retq
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> %x)
   ret <2 x i64> %a
 }
 declare <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double>)
 
 define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
-; X64-SSE-LABEL: llrint_v4i64_v4f64:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm2
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; X64-SSE-NEXT:    cvtsd2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm3
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v4i64_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm2
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT:    cvtsd2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm3
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    cvtsd2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: llrint_v4i64_v4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: llrint_v4i64_v4f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v4i64_v4f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtpd2qq %ymm0, %ymm0
+; AVX512DQ-NEXT:    retq
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> %x)
   ret <4 x i64> %a
 }
 declare <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double>)
 
 define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
-; X64-SSE-LABEL: llrint_v8i64_v8f64:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm4
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm0, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; X64-SSE-NEXT:    cvtsd2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm5
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm1, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
-; X64-SSE-NEXT:    cvtsd2si %xmm2, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm6
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm2, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0]
-; X64-SSE-NEXT:    cvtsd2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm7
-; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; X64-SSE-NEXT:    cvtsd2si %xmm3, %rax
-; X64-SSE-NEXT:    movq %rax, %xmm0
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0]
-; X64-SSE-NEXT:    movdqa %xmm4, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm5, %xmm1
-; X64-SSE-NEXT:    movdqa %xmm6, %xmm2
-; X64-SSE-NEXT:    movdqa %xmm7, %xmm3
-; X64-SSE-NEXT:    retq
+; SSE-LABEL: llrint_v8i64_v8f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm4
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    cvtsd2si %xmm0, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; SSE-NEXT:    cvtsd2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm5
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    cvtsd2si %xmm1, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0]
+; SSE-NEXT:    cvtsd2si %xmm2, %rax
+; SSE-NEXT:    movq %rax, %xmm6
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    cvtsd2si %xmm2, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm0[0]
+; SSE-NEXT:    cvtsd2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm7
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT:    cvtsd2si %xmm3, %rax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm4, %xmm0
+; SSE-NEXT:    movdqa %xmm5, %xmm1
+; SSE-NEXT:    movdqa %xmm6, %xmm2
+; SSE-NEXT:    movdqa %xmm7, %xmm3
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: llrint_v8i64_v8f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: llrint_v8i64_v8f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
+; AVX512-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
+; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm3
+; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512DQ-LABEL: llrint_v8i64_v8f64:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vcvtpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %a = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> %x)
   ret <8 x i64> %a
 }

diff  --git a/llvm/test/CodeGen/X86/vector-lrint.ll b/llvm/test/CodeGen/X86/vector-lrint.ll
index f527a3584f4470..3612205bf1bfa9 100644
--- a/llvm/test/CodeGen/X86/vector-lrint.ll
+++ b/llvm/test/CodeGen/X86/vector-lrint.ll
@@ -1,11 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefix=X86-SSE2
 ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx | FileCheck %s --check-prefixes=X86-AVX,X86-AVX1
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X86-AVX,X86-AVX512
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X86-AVX,AVX512-i32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=i686-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X86-AVX,AVX512-i32
 ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64-AVX-i32,X64-AVX1-i32
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i32,X64-AVX512-i32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i32,AVX512-i32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X64-AVX-i32,AVX512-i32
 ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX1-i64
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i64,X64-AVX512-i64
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefixes=X64-AVX-i64,AVX512-i64
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=X64-AVX-i64,AVX512DQ-i64
 
 define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
 ; X86-SSE2-LABEL: lrint_v1f32:
@@ -35,64 +38,43 @@ declare <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float>)
 define <2 x iXLen> @lrint_v2f32(<2 x float> %x) {
 ; X86-SSE2-LABEL: lrint_v2f32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; X86-SSE2-NEXT:    cvtss2si %xmm1, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm2, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm0, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    cvtss2si %xmm0, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm0
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    cvtps2dq %xmm0, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX-LABEL: lrint_v2f32:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-AVX-NEXT:    vcvtss2si %xmm1, %eax
-; X86-AVX-NEXT:    vcvtss2si %xmm0, %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm1
-; X86-AVX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X86-AVX-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X86-AVX-NEXT:    vcvtss2si %xmm0, %eax
-; X86-AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X86-AVX-NEXT:    vcvtps2dq %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-AVX-i32-LABEL: lrint_v2f32:
 ; X64-AVX-i32:       # %bb.0:
-; X64-AVX-i32-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm1, %eax
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm0, %ecx
-; X64-AVX-i32-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX-i32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm0, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X64-AVX-i32-NEXT:    vcvtps2dq %xmm0, %xmm0
 ; X64-AVX-i32-NEXT:    retq
 ;
-; X64-AVX-i64-LABEL: lrint_v2f32:
-; X64-AVX-i64:       # %bb.0:
-; X64-AVX-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-AVX-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX-i64-NEXT:    retq
+; X64-AVX1-i64-LABEL: lrint_v2f32:
+; X64-AVX1-i64:       # %bb.0:
+; X64-AVX1-i64-NEXT:    vcvtss2si %xmm0, %rax
+; X64-AVX1-i64-NEXT:    vmovq %rax, %xmm1
+; X64-AVX1-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX1-i64-NEXT:    vcvtss2si %xmm0, %rax
+; X64-AVX1-i64-NEXT:    vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-AVX1-i64-NEXT:    retq
+;
+; AVX512-i64-LABEL: lrint_v2f32:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v2f32:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtps2qq %xmm0, %xmm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float> %x)
   ret <2 x iXLen> %a
 }
@@ -101,53 +83,17 @@ declare <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float>)
 define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
 ; X86-SSE2-LABEL: lrint_v4f32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; X86-SSE2-NEXT:    cvtss2si %xmm1, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm2, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm0, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-SSE2-NEXT:    cvtss2si %xmm0, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm0
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    cvtps2dq %xmm0, %xmm0
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX-LABEL: lrint_v4f32:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-AVX-NEXT:    vcvtss2si %xmm1, %eax
-; X86-AVX-NEXT:    vcvtss2si %xmm0, %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm1
-; X86-AVX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X86-AVX-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X86-AVX-NEXT:    vcvtss2si %xmm0, %eax
-; X86-AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X86-AVX-NEXT:    vcvtps2dq %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-AVX-i32-LABEL: lrint_v4f32:
 ; X64-AVX-i32:       # %bb.0:
-; X64-AVX-i32-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm1, %eax
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm0, %ecx
-; X64-AVX-i32-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX-i32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X64-AVX-i32-NEXT:    vcvtss2si %xmm0, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X64-AVX-i32-NEXT:    vcvtps2dq %xmm0, %xmm0
 ; X64-AVX-i32-NEXT:    retq
 ;
 ; X64-AVX1-i64-LABEL: lrint_v4f32:
@@ -168,23 +114,28 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
 ; X64-AVX1-i64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-AVX1-i64-NEXT:    retq
 ;
-; X64-AVX512-i64-LABEL: lrint_v4f32:
-; X64-AVX512-i64:       # %bb.0:
-; X64-AVX512-i64-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX512-i64-NEXT:    retq
+; AVX512-i64-LABEL: lrint_v4f32:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v4f32:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtps2qq %xmm0, %ymm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float> %x)
   ret <4 x iXLen> %a
 }
@@ -193,152 +144,19 @@ declare <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float>)
 define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
 ; X86-SSE2-LABEL: lrint_v8f32:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movaps %xmm0, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X86-SSE2-NEXT:    cvtss2si %xmm0, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm0
-; X86-SSE2-NEXT:    movaps %xmm2, %xmm3
-; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm3, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm3
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm2, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; X86-SSE2-NEXT:    cvtss2si %xmm2, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; X86-SSE2-NEXT:    movaps %xmm1, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
-; X86-SSE2-NEXT:    cvtss2si %xmm2, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
-; X86-SSE2-NEXT:    movaps %xmm1, %xmm3
-; X86-SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm3, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm3
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X86-SSE2-NEXT:    cvtss2si %xmm1, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X86-SSE2-NEXT:    cvtss2si %xmm1, %eax
-; X86-SSE2-NEXT:    movd %eax, %xmm1
-; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    cvtps2dq %xmm0, %xmm0
+; X86-SSE2-NEXT:    cvtps2dq %xmm1, %xmm1
 ; X86-SSE2-NEXT:    retl
 ;
-; X86-AVX1-LABEL: lrint_v8f32:
-; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X86-AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-AVX1-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX1-NEXT:    vcvtss2si %xmm1, %ecx
-; X86-AVX1-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; X86-AVX1-NEXT:    vcvtss2si %xmm3, %eax
-; X86-AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X86-AVX1-NEXT:    vcvtss2si %xmm1, %eax
-; X86-AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X86-AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-AVX1-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX1-NEXT:    vcvtss2si %xmm0, %ecx
-; X86-AVX1-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; X86-AVX1-NEXT:    vcvtss2si %xmm3, %eax
-; X86-AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X86-AVX1-NEXT:    vcvtss2si %xmm0, %eax
-; X86-AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X86-AVX1-NEXT:    retl
-;
-; X86-AVX512-LABEL: lrint_v8f32:
-; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X86-AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-AVX512-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX512-NEXT:    vcvtss2si %xmm1, %ecx
-; X86-AVX512-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX512-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; X86-AVX512-NEXT:    vcvtss2si %xmm3, %eax
-; X86-AVX512-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X86-AVX512-NEXT:    vcvtss2si %xmm1, %eax
-; X86-AVX512-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X86-AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-AVX512-NEXT:    vcvtss2si %xmm2, %eax
-; X86-AVX512-NEXT:    vcvtss2si %xmm0, %ecx
-; X86-AVX512-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX512-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; X86-AVX512-NEXT:    vcvtss2si %xmm3, %eax
-; X86-AVX512-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X86-AVX512-NEXT:    vcvtss2si %xmm0, %eax
-; X86-AVX512-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X86-AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X86-AVX512-NEXT:    retl
-;
-; X64-AVX1-i32-LABEL: lrint_v8f32:
-; X64-AVX1-i32:       # %bb.0:
-; X64-AVX1-i32-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-i32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm1, %ecx
-; X64-AVX1-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX1-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm3, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm1, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X64-AVX1-i32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm0, %ecx
-; X64-AVX1-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX1-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm3, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X64-AVX1-i32-NEXT:    vcvtss2si %xmm0, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X64-AVX1-i32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX1-i32-NEXT:    retq
+; X86-AVX-LABEL: lrint_v8f32:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    vcvtps2dq %ymm0, %ymm0
+; X86-AVX-NEXT:    retl
 ;
-; X64-AVX512-i32-LABEL: lrint_v8f32:
-; X64-AVX512-i32:       # %bb.0:
-; X64-AVX512-i32-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX512-i32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm1, %ecx
-; X64-AVX512-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX512-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm3, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm1, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X64-AVX512-i32-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm0, %ecx
-; X64-AVX512-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX512-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm3, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; X64-AVX512-i32-NEXT:    vcvtss2si %xmm0, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X64-AVX512-i32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX512-i32-NEXT:    retq
+; X64-AVX-i32-LABEL: lrint_v8f32:
+; X64-AVX-i32:       # %bb.0:
+; X64-AVX-i32-NEXT:    vcvtps2dq %ymm0, %ymm0
+; X64-AVX-i32-NEXT:    retq
 ;
 ; X64-AVX1-i64-LABEL: lrint_v8f32:
 ; X64-AVX1-i64:       # %bb.0:
@@ -374,39 +192,44 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
 ; X64-AVX1-i64-NEXT:    vmovaps %ymm2, %ymm0
 ; X64-AVX1-i64-NEXT:    retq
 ;
-; X64-AVX512-i64-LABEL: lrint_v8f32:
-; X64-AVX512-i64:       # %bb.0:
-; X64-AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX512-i64-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm3, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; X64-AVX512-i64-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm3, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X64-AVX512-i64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X64-AVX512-i64-NEXT:    retq
+; AVX512-i64-LABEL: lrint_v8f32:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-i64-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-i64-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm3, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512-i64-NEXT:    vcvtss2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-i64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v8f32:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtps2qq %ymm0, %zmm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f32(<8 x float> %x)
   ret <8 x iXLen> %a
 }
@@ -473,15 +296,30 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) {
 ; X64-AVX-i32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
 ; X64-AVX-i32-NEXT:    retq
 ;
-; X64-AVX-i64-LABEL: lrint_v2f64:
-; X64-AVX-i64:       # %bb.0:
-; X64-AVX-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-AVX-i64-NEXT:    retq
+; X64-AVX1-i64-LABEL: lrint_v2f64:
+; X64-AVX1-i64:       # %bb.0:
+; X64-AVX1-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; X64-AVX1-i64-NEXT:    vmovq %rax, %xmm1
+; X64-AVX1-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-AVX1-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; X64-AVX1-i64-NEXT:    vmovq %rax, %xmm0
+; X64-AVX1-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-AVX1-i64-NEXT:    retq
+;
+; AVX512-i64-LABEL: lrint_v2f64:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v2f64:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtpd2qq %xmm0, %xmm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double> %x)
   ret <2 x iXLen> %a
 }
@@ -508,33 +346,13 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
 ;
 ; X86-AVX-LABEL: lrint_v4f64:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; X86-AVX-NEXT:    vcvtsd2si %xmm1, %eax
-; X86-AVX-NEXT:    vcvtsd2si %xmm0, %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm1
-; X86-AVX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-AVX-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X86-AVX-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X86-AVX-NEXT:    vcvtpd2dq %ymm0, %xmm0
 ; X86-AVX-NEXT:    vzeroupper
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-AVX-i32-LABEL: lrint_v4f64:
 ; X64-AVX-i32:       # %bb.0:
-; X64-AVX-i32-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; X64-AVX-i32-NEXT:    vcvtsd2si %xmm1, %eax
-; X64-AVX-i32-NEXT:    vcvtsd2si %xmm0, %ecx
-; X64-AVX-i32-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX-i32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-AVX-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-AVX-i32-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX-i32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; X64-AVX-i32-NEXT:    vcvtpd2dq %ymm0, %xmm0
 ; X64-AVX-i32-NEXT:    vzeroupper
 ; X64-AVX-i32-NEXT:    retq
 ;
@@ -556,23 +374,28 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
 ; X64-AVX1-i64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-AVX1-i64-NEXT:    retq
 ;
-; X64-AVX512-i64-LABEL: lrint_v4f64:
-; X64-AVX512-i64:       # %bb.0:
-; X64-AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX512-i64-NEXT:    retq
+; AVX512-i64-LABEL: lrint_v4f64:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v4f64:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtpd2qq %ymm0, %ymm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double> %x)
   ret <4 x iXLen> %a
 }
@@ -623,114 +446,23 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ;
 ; X86-AVX1-LABEL: lrint_v8f64:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; X86-AVX1-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX1-NEXT:    vcvtsd2si %xmm1, %ecx
-; X86-AVX1-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; X86-AVX1-NEXT:    vcvtsd2si %xmm1, %eax
-; X86-AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; X86-AVX1-NEXT:    vcvtsd2si %xmm1, %eax
-; X86-AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X86-AVX1-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX1-NEXT:    vcvtsd2si %xmm0, %ecx
-; X86-AVX1-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X86-AVX1-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vcvtpd2dq %ymm0, %xmm0
+; X86-AVX1-NEXT:    vcvtpd2dq %ymm1, %xmm1
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    retl
 ;
-; X86-AVX512-LABEL: lrint_v8f64:
-; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; X86-AVX512-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX512-NEXT:    vcvtsd2si %xmm1, %ecx
-; X86-AVX512-NEXT:    vmovd %ecx, %xmm1
-; X86-AVX512-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; X86-AVX512-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX512-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
-; X86-AVX512-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX512-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X86-AVX512-NEXT:    vcvtsd2si %xmm2, %eax
-; X86-AVX512-NEXT:    vcvtsd2si %xmm0, %ecx
-; X86-AVX512-NEXT:    vmovd %ecx, %xmm2
-; X86-AVX512-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-AVX512-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX512-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X86-AVX512-NEXT:    vcvtsd2si %xmm0, %eax
-; X86-AVX512-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X86-AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X86-AVX512-NEXT:    retl
+; AVX512-i32-LABEL: lrint_v8f64:
+; AVX512-i32:       # %bb.0:
+; AVX512-i32-NEXT:    vcvtpd2dq %zmm0, %ymm0
+; AVX512-i32-NEXT:    ret{{[l|q]}}
 ;
 ; X64-AVX1-i32-LABEL: lrint_v8f64:
 ; X64-AVX1-i32:       # %bb.0:
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm1, %ecx
-; X64-AVX1-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX1-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm1, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm1, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm0, %ecx
-; X64-AVX1-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX1-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX1-i32-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX1-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX1-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
+; X64-AVX1-i32-NEXT:    vcvtpd2dq %ymm0, %xmm0
+; X64-AVX1-i32-NEXT:    vcvtpd2dq %ymm1, %xmm1
 ; X64-AVX1-i32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-AVX1-i32-NEXT:    retq
 ;
-; X64-AVX512-i32-LABEL: lrint_v8f64:
-; X64-AVX512-i32:       # %bb.0:
-; X64-AVX512-i32-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm1, %ecx
-; X64-AVX512-i32-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX512-i32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X64-AVX512-i32-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm2, %eax
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm0, %ecx
-; X64-AVX512-i32-NEXT:    vmovd %ecx, %xmm2
-; X64-AVX512-i32-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; X64-AVX512-i32-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX512-i32-NEXT:    vcvtsd2si %xmm0, %eax
-; X64-AVX512-i32-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; X64-AVX512-i32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX512-i32-NEXT:    retq
-;
 ; X64-AVX1-i64-LABEL: lrint_v8f64:
 ; X64-AVX1-i64:       # %bb.0:
 ; X64-AVX1-i64-NEXT:    vextractf128 $1, %ymm0, %xmm2
@@ -763,39 +495,44 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; X64-AVX1-i64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; X64-AVX1-i64-NEXT:    retq
 ;
-; X64-AVX512-i64-LABEL: lrint_v8f64:
-; X64-AVX512-i64:       # %bb.0:
-; X64-AVX512-i64-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm1
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; X64-AVX512-i64-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; X64-AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm2
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm3
-; X64-AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; X64-AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
-; X64-AVX512-i64-NEXT:    vmovq %rax, %xmm0
-; X64-AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; X64-AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; X64-AVX512-i64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; X64-AVX512-i64-NEXT:    retq
+; AVX512-i64-LABEL: lrint_v8f64:
+; AVX512-i64:       # %bb.0:
+; AVX512-i64-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
+; AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm1, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm1
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-i64-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
+; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-i64-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm2, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm2
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm3
+; AVX512-i64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-i64-NEXT:    vcvtsd2si %xmm0, %rax
+; AVX512-i64-NEXT:    vmovq %rax, %xmm0
+; AVX512-i64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX512-i64-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-i64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-i64-NEXT:    retq
+;
+; AVX512DQ-i64-LABEL: lrint_v8f64:
+; AVX512DQ-i64:       # %bb.0:
+; AVX512DQ-i64-NEXT:    vcvtpd2qq %zmm0, %zmm0
+; AVX512DQ-i64-NEXT:    retq
   %a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8f64(<8 x double> %x)
   ret <8 x iXLen> %a
 }


        


More information about the llvm-commits mailing list