[llvm] [X86] Adding lowerings for vector ISD::LRINT and ISD::LLRINT (PR #90065)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 25 07:32:52 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Phoebe Wang (phoebewang)
Changes
---
Patch is 52.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90065.diff
6 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+51-1)
- (modified) llvm/lib/Target/X86/X86ISelLowering.h (+2-1)
- (modified) llvm/lib/Target/X86/X86InstrAVX512.td (+35)
- (modified) llvm/lib/Target/X86/X86InstrSSE.td (+14-1)
- (modified) llvm/test/CodeGen/X86/vector-llrint.ll (+214-10)
- (modified) llvm/test/CodeGen/X86/vector-lrint.ll (+95-408)
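Before the diff, a quick illustration of what the new lowerings buy. This is a minimal standalone sketch, not one of the patch's test cases (the function name is made up); the expected instruction is taken from the updated X64-AVX-512 checks in vector-llrint.ll shown below:

```llvm
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s

define <4 x i64> @llrint_sketch(<4 x float> %x) {
; CHECK-LABEL: llrint_sketch:
; CHECK: vcvtps2qq %xmm0, %ymm0
  %a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
  ret <4 x i64> %a
}

declare <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float>)
```

Without AVX512DQ/VL, the same call still goes through the scalar cvtss2si/vmovq/vpunpcklqdq expansion visible in the X64-AVX checks in the diff.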
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bb43cbe15f5225..827537818f059f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1092,6 +1092,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
+ setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
+
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
@@ -1431,6 +1433,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUM, VT, Custom);
}
+ setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
+
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
@@ -1731,6 +1735,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
+ if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
+ setOperationAction(ISD::LRINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::LRINT, MVT::v8f32, Legal);
+ setOperationAction(ISD::LLRINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::LLRINT, MVT::v8f32, Legal);
+ setOperationAction(ISD::LRINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::LRINT, MVT::v4f64, Legal);
+ setOperationAction(ISD::LLRINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::LLRINT, MVT::v4f64, Legal);
+ }
// This block controls legalization for 512-bit operations with 8/16/32/64 bit
// elements. 512-bits can be disabled based on prefer-vector-width and
@@ -1765,6 +1779,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
+ setOperationAction(ISD::LRINT, MVT::v16f32,
+ Subtarget.hasDQI() ? Legal : Custom);
+ setOperationAction(ISD::LRINT, MVT::v8f64,
+ Subtarget.hasDQI() ? Legal : Custom);
+ if (Subtarget.hasDQI())
+ setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
@@ -2488,6 +2508,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::FMAXNUM,
ISD::SUB,
ISD::LOAD,
+ ISD::LRINT,
+ ISD::LLRINT,
ISD::MLOAD,
ISD::STORE,
ISD::MSTORE,
@@ -21159,10 +21181,15 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
+ EVT DstVT = Op.getSimpleValueType();
MVT SrcVT = Src.getSimpleValueType();
+ if (SrcVT.isVector())
+ return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
+
if (SrcVT == MVT::f16)
return SDValue();
@@ -32217,7 +32244,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::LRINT:
- case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
+ case ISD::LLRINT: return LowerLRINT_LLRINT(Op, Subtarget, DAG);
case ISD::SETCC:
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
@@ -51556,6 +51583,22 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ SDLoc DL(N);
+
+ if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
+ SrcVT != MVT::v2f32)
+ return SDValue();
+
+ return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
+ DAG.getUNDEF(SrcVT)));
+}
+
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
@@ -51902,6 +51945,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
+ // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
+ if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
+ Src.hasOneUse())
+ return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
+
return SDValue();
}
@@ -56848,6 +56896,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
return combineUIntToFP(N, DAG, Subtarget);
+ case ISD::LRINT:
+ case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case X86ISD::VFCMULC:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index e348ba6e8ac085..eea771d235b2da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1673,7 +1673,8 @@ namespace llvm {
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLRINT_LLRINT(SDValue Op, const X86Subtarget &STI,
+ SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 43a40f5e691ea3..ec2a5f52a7b6aa 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -8811,7 +8811,18 @@ let Predicates = [HasVLX] in {
def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
v4i32x_info.ImmAllZerosV, VK2WM:$mask),
(VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4i32 (lrint VR128X:$src)), (VCVTPS2DQZ128rr VR128X:$src)>;
+ def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQZ128rm addr:$src)>;
+ def : Pat<(v8i32 (lrint VR256X:$src)), (VCVTPS2DQZ256rr VR256X:$src)>;
+ def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQZ256rm addr:$src)>;
+ def : Pat<(v4i32 (lrint VR256X:$src)), (VCVTPD2DQZ256rr VR256X:$src)>;
+ def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQZ256rm addr:$src)>;
}
+def : Pat<(v16i32 (lrint VR512:$src)), (VCVTPS2DQZrr VR512:$src)>;
+def : Pat<(v16i32 (lrint (loadv16f32 addr:$src))), (VCVTPS2DQZrm addr:$src)>;
+def : Pat<(v8i32 (lrint VR512:$src)), (VCVTPD2DQZrr VR512:$src)>;
+def : Pat<(v8i32 (lrint (loadv8f64 addr:$src))), (VCVTPD2DQZrm addr:$src)>;
let Predicates = [HasDQI, HasVLX] in {
def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
@@ -8857,6 +8868,30 @@ let Predicates = [HasDQI, HasVLX] in {
(X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
v2i64x_info.ImmAllZerosV)),
(VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
+
+ def : Pat<(v4i64 (lrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+ def : Pat<(v4i64 (lrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+ def : Pat<(v4i64 (llrint VR128X:$src)), (VCVTPS2QQZ256rr VR128X:$src)>;
+ def : Pat<(v4i64 (llrint (loadv4f32 addr:$src))), (VCVTPS2QQZ256rm addr:$src)>;
+ def : Pat<(v2i64 (lrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+ def : Pat<(v2i64 (lrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+ def : Pat<(v4i64 (lrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+ def : Pat<(v4i64 (lrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+ def : Pat<(v2i64 (llrint VR128X:$src)), (VCVTPD2QQZ128rr VR128X:$src)>;
+ def : Pat<(v2i64 (llrint (loadv2f64 addr:$src))), (VCVTPD2QQZ128rm addr:$src)>;
+ def : Pat<(v4i64 (llrint VR256X:$src)), (VCVTPD2QQZ256rr VR256X:$src)>;
+ def : Pat<(v4i64 (llrint (loadv4f64 addr:$src))), (VCVTPD2QQZ256rm addr:$src)>;
+}
+
+let Predicates = [HasDQI] in {
+ def : Pat<(v8i64 (lrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+ def : Pat<(v8i64 (lrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+ def : Pat<(v8i64 (llrint VR256X:$src)), (VCVTPS2QQZrr VR256X:$src)>;
+ def : Pat<(v8i64 (llrint (loadv8f32 addr:$src))), (VCVTPS2QQZrm addr:$src)>;
+ def : Pat<(v8i64 (lrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>;
+ def : Pat<(v8i64 (lrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>;
+ def : Pat<(v8i64 (llrint VR512:$src)), (VCVTPD2QQZrr VR512:$src)>;
+ def : Pat<(v8i64 (llrint (loadv8f64 addr:$src))), (VCVTPD2QQZrm addr:$src)>;
}
let Predicates = [HasVLX] in {
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 063b572761e7d1..62b9b93953ad5a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1554,7 +1554,6 @@ def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
(v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
-
// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
@@ -1586,6 +1585,20 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
}
+let Predicates = [HasAVX] in {
+ def : Pat<(v4i32 (lrint VR128:$src)), (VCVTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQrm addr:$src)>;
+ def : Pat<(v8i32 (lrint VR256:$src)), (VCVTPS2DQYrr VR256:$src)>;
+ def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQYrm addr:$src)>;
+ def : Pat<(v4i32 (lrint VR256:$src)), (VCVTPD2DQYrr VR256:$src)>;
+ def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQYrm addr:$src)>;
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(v4i32 (lrint VR128:$src)), (CVTPS2DQrr VR128:$src)>;
+ def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (CVTPS2DQrm addr:$src)>;
+}
+
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
diff --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll
index 46904f82fd5d6d..0be58ca86aa626 100644
--- a/llvm/test/CodeGen/X86/vector-llrint.ll
+++ b/llvm/test/CodeGen/X86/vector-llrint.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64-SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefix=X64-AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512dq,avx512vl | FileCheck %s --check-prefixes=CHECK,X64-AVX-512
define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
; X64-SSE-LABEL: llrint_v1i64_v1f32:
@@ -9,10 +9,10 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
; X64-SSE-NEXT: cvtss2si %xmm0, %rax
; X64-SSE-NEXT: retq
;
-; X64-AVX-LABEL: llrint_v1i64_v1f32:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
-; X64-AVX-NEXT: retq
+; CHECK-LABEL: llrint_v1i64_v1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcvtss2si %xmm0, %rax
+; CHECK-NEXT: retq
%a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
ret <1 x i64> %a
}
@@ -39,6 +39,11 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
; X64-AVX-NEXT: vmovq %rax, %xmm0
; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX-NEXT: retq
+;
+; X64-AVX-512-LABEL: llrint_v2i64_v2f32:
+; X64-AVX-512: # %bb.0:
+; X64-AVX-512-NEXT: vcvtps2qq %xmm0, %xmm0
+; X64-AVX-512-NEXT: retq
%a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
ret <2 x i64> %a
}
@@ -64,6 +69,29 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; X64-SSE-NEXT: movdqa %xmm2, %xmm0
; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: llrint_v4i64_v4f32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm1
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm0
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX-NEXT: retq
+;
+; X64-AVX-512-LABEL: llrint_v4i64_v4f32:
+; X64-AVX-512: # %bb.0:
+; X64-AVX-512-NEXT: vcvtps2qq %xmm0, %ymm0
+; X64-AVX-512-NEXT: retq
%a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
ret <4 x i64> %a
}
@@ -105,6 +133,45 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
; X64-SSE-NEXT: movdqa %xmm4, %xmm1
; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: llrint_v8i64_v8f32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm1
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
+; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm1
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm0
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X64-AVX-NEXT: vmovaps %ymm2, %ymm0
+; X64-AVX-NEXT: retq
+;
+; X64-AVX-512-LABEL: llrint_v8i64_v8f32:
+; X64-AVX-512: # %bb.0:
+; X64-AVX-512-NEXT: vcvtps2qq %ymm0, %zmm0
+; X64-AVX-512-NEXT: retq
%a = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> %x)
ret <8 x i64> %a
}
@@ -183,6 +250,78 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
; X64-SSE-NEXT: movdqa %xmm0, 16(%rdi)
; X64-SSE-NEXT: movdqa %xmm4, (%rdi)
; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: llrint_v16i64_v16f32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm2
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm0
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm4, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm4
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; X64-AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm4, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm4
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm4
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm2, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm5, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm5
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
+; X64-AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm3, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; X64-AVX-NEXT: vcvtss2si %xmm5, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm5
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm3[0]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm5
+; X64-AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-AVX-NEXT: vcvtss2si %xmm1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm1
+; X64-AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; X64-AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3
+; X64-AVX-NEXT: vmovaps %ymm4, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX-512-LABEL: llrint_v16i64_v16f32:
+; X64-AVX-512: # %bb.0:
+; X64-AVX-512-NEXT: vcvtps2qq %ymm0, %zmm2
+; X64-AVX-512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; X64-AVX-512-NEXT: vcvtps2qq %ymm0, %zmm1
+; X64-AVX-512-NEXT: vmovaps %zmm2, %zmm0
+; X64-AVX-512-NEXT: retq
%a = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> %x)
ret <16 x i64> %a
}
@@ -194,10 +333,10 @@ define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
; X64-SSE-NEXT: cvtsd2si %xmm0, %rax
; X64-SSE-NEXT: retq
;
-; X64-AVX-LABEL: llrint_v1i64_v1f64:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vcvtsd2si %xmm0, %rax
-; X64-AVX-NEXT: retq
+; CHECK-LABEL: llrint_v1i64_v1f64:
+; CHECK: ...
[truncated]
``````````
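One piece that is easy to miss in the truncated diff is the new trunc(lrint) fold in combineTruncate. Below is a hedged sketch of the IR shape it is meant to catch, inferred from the code comment in the diff rather than from a test shown above (the function name is illustrative):

```llvm
; Assumed behavior based on the combineTruncate comment in the patch: the explicit
; trunc of the v4i64 lrint result is folded into a v4i32 lrint, which can then match
; the new packed cvtps2dq patterns instead of being scalarized.
define <4 x i32> @lrint_trunc_sketch(<4 x float> %x) {
  %r = call <4 x i64> @llvm.lrint.v4i64.v4f32(<4 x float> %x)
  %t = trunc <4 x i64> %r to <4 x i32>
  ret <4 x i32> %t
}

declare <4 x i64> @llvm.lrint.v4i64.v4f32(<4 x float>)
```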
https://github.com/llvm/llvm-project/pull/90065
More information about the llvm-commits mailing list