[llvm] e78156a - Scalarize the vector inputs to llvm.lround intrinsic by default. (#101054)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 21 10:13:59 PDT 2024
Author: Sumanth Gundapaneni
Date: 2024-08-21T12:13:56-05:00
New Revision: e78156a0e225673e592920410c8cadc94f19aa66
URL: https://github.com/llvm/llvm-project/commit/e78156a0e225673e592920410c8cadc94f19aa66
DIFF: https://github.com/llvm/llvm-project/commit/e78156a0e225673e592920410c8cadc94f19aa66.diff
LOG: Scalarize the vector inputs to llvm.lround intrinsic by default. (#101054)
The verifier is updated in a separate patch to allow vector types for
the llvm.lround and llvm.llround intrinsics.
Added:
Modified:
llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/lib/CodeGen/TargetLoweringBase.cpp
llvm/test/CodeGen/AMDGPU/lround.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index bdbef20e20960d..3fece81df1f2fd 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4921,6 +4921,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_INTRINSIC_LLRINT:
case G_INTRINSIC_ROUND:
case G_INTRINSIC_ROUNDEVEN:
+ case G_LROUND:
+ case G_LLROUND:
case G_INTRINSIC_TRUNC:
case G_FCOS:
case G_FSIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c9ab7e7a66079c..11935cbc309f01 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -507,7 +507,7 @@ namespace {
SDValue visitUINT_TO_FP(SDNode *N);
SDValue visitFP_TO_SINT(SDNode *N);
SDValue visitFP_TO_UINT(SDNode *N);
- SDValue visitXRINT(SDNode *N);
+ SDValue visitXROUND(SDNode *N);
SDValue visitFP_ROUND(SDNode *N);
SDValue visitFP_EXTEND(SDNode *N);
SDValue visitFNEG(SDNode *N);
@@ -1929,8 +1929,10 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
- case ISD::LLRINT: return visitXRINT(N);
+ case ISD::LLRINT: return visitXROUND(N);
case ISD::FP_ROUND: return visitFP_ROUND(N);
case ISD::FP_EXTEND: return visitFP_EXTEND(N);
case ISD::FNEG: return visitFNEG(N);
@@ -17998,15 +18000,17 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
return FoldIntToFPToInt(N, DAG);
}
-SDValue DAGCombiner::visitXRINT(SDNode *N) {
+SDValue DAGCombiner::visitXROUND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (lrint|llrint undef) -> undef
+ // fold (lround|llround undef) -> undef
if (N0.isUndef())
return DAG.getUNDEF(VT);
// fold (lrint|llrint c1fp) -> c1
+ // fold (lround|llround c1fp) -> c1
if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index ad0c054d3ccd50..221dcfe145594f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2441,6 +2441,8 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::FCOPYSIGN: R = PromoteFloatOp_FCOPYSIGN(N, OpNo); break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
case ISD::LLRINT: R = PromoteFloatOp_UnaryOp(N, OpNo); break;
case ISD::FP_TO_SINT_SAT:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 27dd4ae241bd10..1088db4bdbe0b3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1052,7 +1052,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue WidenVecRes_Convert(SDNode *N);
SDValue WidenVecRes_Convert_StrictFP(SDNode *N);
SDValue WidenVecRes_FP_TO_XINT_SAT(SDNode *N);
- SDValue WidenVecRes_XRINT(SDNode *N);
+ SDValue WidenVecRes_XROUND(SDNode *N);
SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
SDValue WidenVecRes_UnarySameEltsWithScalarArg(SDNode *N);
SDValue WidenVecRes_ExpOp(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 57843f0959ac28..3f104baed97b1a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -473,6 +473,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
Node->getValueType(0), Scale);
break;
}
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
case ISD::LLRINT:
case ISD::SINT_TO_FP:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index aad0047b4839a8..8315efcb6750f9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -110,6 +110,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::LLRINT:
case ISD::FROUND:
case ISD::FROUNDEVEN:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::FSIN:
case ISD::FSINH:
case ISD::FSQRT:
@@ -752,6 +754,8 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::FP_TO_UINT:
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
case ISD::LLRINT:
Res = ScalarizeVecOp_UnaryOp(N);
@@ -1215,6 +1219,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::VP_FROUND:
case ISD::FROUNDEVEN:
case ISD::VP_FROUNDEVEN:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::FSIN:
case ISD::FSINH:
case ISD::FSQRT: case ISD::VP_SQRT:
@@ -3270,6 +3276,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::FTRUNC:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
case ISD::LLRINT:
Res = SplitVecOp_UnaryOp(N);
@@ -4594,7 +4602,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::LLRINT:
case ISD::VP_LRINT:
case ISD::VP_LLRINT:
- Res = WidenVecRes_XRINT(N);
+ case ISD::LROUND:
+ case ISD::LLROUND:
+ Res = WidenVecRes_XROUND(N);
break;
case ISD::FABS:
@@ -5231,7 +5241,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_FP_TO_XINT_SAT(SDNode *N) {
return DAG.getNode(N->getOpcode(), dl, WidenVT, Src, N->getOperand(1));
}
-SDValue DAGTypeLegalizer::WidenVecRes_XRINT(SDNode *N) {
+SDValue DAGTypeLegalizer::WidenVecRes_XROUND(SDNode *N) {
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
ElementCount WidenNumElts = WidenVT.getVectorElementCount();
@@ -6480,6 +6490,8 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VSELECT: Res = WidenVecOp_VSELECT(N); break;
case ISD::FLDEXP:
case ISD::FCOPYSIGN:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
case ISD::LLRINT:
Res = WidenVecOp_UnrollVectorOp(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 18a3b7bce104a7..27675dce70c260 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5436,6 +5436,8 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
case ISD::FCEIL:
case ISD::FROUND:
case ISD::FROUNDEVEN:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::FRINT:
case ISD::LRINT:
case ISD::LLRINT:
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 4ff8617f740c89..35d6304cf9b400 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -774,8 +774,9 @@ void TargetLoweringBase::initActions() {
setOperationAction(
{ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG,
ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
- ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::FTAN, ISD::FACOS,
- ISD::FASIN, ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH},
+ ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND,
+ ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN,
+ ISD::FCOSH, ISD::FSINH, ISD::FTANH},
VT, Expand);
// Constrained floating-point operations default to expand.
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index d45d83026013df..072ee70b840d83 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -6,94 +6,6 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-GISEL %s
-declare float @llvm.round.f32(float)
-declare i32 @llvm.lround.i32.f32(float)
-declare i32 @llvm.lround.i32.f64(double)
-declare i64 @llvm.lround.i64.f32(float)
-declare i64 @llvm.lround.i64.f64(double)
-declare i64 @llvm.llround.i64.f32(float)
-declare half @llvm.round.f16(half)
-declare i32 @llvm.lround.i32.f16(half %arg)
-
-define float @intrinsic_fround(float %arg) {
-; GFX9-SDAG-LABEL: intrinsic_fround:
-; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_trunc_f32_e32 v1, v0
-; GFX9-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-SDAG-NEXT: s_brev_b32 s4, -2
-; GFX9-SDAG-NEXT: v_bfi_b32 v0, s4, v2, v0
-; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: intrinsic_fround:
-; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_trunc_f32_e32 v1, v0
-; GFX9-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v3, v2
-; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: intrinsic_fround:
-; GFX10-SDAG: ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_trunc_f32_e32 v1, v0
-; GFX10-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: intrinsic_fround:
-; GFX10-GISEL: ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_trunc_f32_e32 v1, v0
-; GFX10-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: intrinsic_fround:
-; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_trunc_f32_e32 v1, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: intrinsic_fround:
-; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_trunc_f32_e32 v1, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-entry:
- %res = tail call float @llvm.round.f32(float %arg)
- ret float %res
-}
-
define i32 @intrinsic_lround_i32_f32(float %arg) {
; GFX9-SDAG-LABEL: intrinsic_lround_i32_f32:
; GFX9-SDAG: ; %bb.0: ; %entry
@@ -1034,3 +946,394 @@ entry:
ret i32 %res
}
+define <2 x i32> @intrinsic_lround_v2i32_v2f32(<2 x float> %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT: s_brev_b32 s6, -2
+; GFX9-SDAG-NEXT: v_bfi_b32 v0, s6, v3, v0
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v1
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT: v_bfi_b32 v1, s6, v3, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX9-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2
+; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v4, v3
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v1
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, v4, v3
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2
+; GFX10-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX10-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-GISEL-NEXT: v_sub_f32_e32 v4, v0, v2
+; GFX10-GISEL-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX10-GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX11-SDAG-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX11-GISEL-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %res = tail call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
+ ret <2 x i32> %res
+}
+
+define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT: s_brev_b32 s6, -2
+; GFX9-SDAG-NEXT: v_bfi_b32 v0, s6, v3, v0
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v0, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s7, 0x2f800000
+; GFX9-SDAG-NEXT: v_mul_f32_e64 v2, |v0|, s7
+; GFX9-SDAG-NEXT: v_floor_f32_e32 v2, v2
+; GFX9-SDAG-NEXT: s_mov_b32 s8, 0xcf800000
+; GFX9-SDAG-NEXT: v_fma_f32 v3, v2, s8, |v0|
+; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX9-SDAG-NEXT: v_ashrrev_i32_e32 v4, 31, v0
+; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v3, v4
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT: v_bfi_b32 v1, s6, v5, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX9-SDAG-NEXT: v_mul_f32_e64 v1, |v3|, s7
+; GFX9-SDAG-NEXT: v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT: v_fma_f32 v5, v1, s8, |v3|
+; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v6, v1
+; GFX9-SDAG-NEXT: v_xor_b32_e32 v2, v2, v4
+; GFX9-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; GFX9-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v4, vcc
+; GFX9-SDAG-NEXT: v_xor_b32_e32 v2, v5, v3
+; GFX9-SDAG-NEXT: v_xor_b32_e32 v4, v6, v3
+; GFX9-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX9-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2
+; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v4, v3
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x2f800000
+; GFX9-GISEL-NEXT: v_mul_f32_e64 v5, |v2|, v3
+; GFX9-GISEL-NEXT: v_floor_f32_e32 v5, v5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0xcf800000
+; GFX9-GISEL-NEXT: v_fma_f32 v2, v5, v6, |v2|
+; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v0
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v2, v7
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, v5, v7
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v5, v1
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v8, v1, v5
+; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, v4, v8
+; GFX9-GISEL-NEXT: v_add_f32_e32 v4, v5, v1
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v1, v4
+; GFX9-GISEL-NEXT: v_mul_f32_e64 v3, |v1|, v3
+; GFX9-GISEL-NEXT: v_floor_f32_e32 v3, v3
+; GFX9-GISEL-NEXT: v_fma_f32 v1, v3, v6, |v1|
+; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v5, v1
+; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX9-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7
+; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; GFX9-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v7, vcc
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, v5, v4
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v4
+; GFX9-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2
+; GFX10-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v0, v0
+; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v1, v1
+; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GFX10-SDAG-NEXT: v_floor_f32_e32 v2, v2
+; GFX10-SDAG-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v1|
+; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; GFX10-SDAG-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v0|
+; GFX10-SDAG-NEXT: v_floor_f32_e32 v3, v3
+; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX10-SDAG-NEXT: v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v1, v4
+; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX10-SDAG-NEXT: v_xor_b32_e32 v2, v2, v5
+; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX10-SDAG-NEXT: v_xor_b32_e32 v3, v3, v6
+; GFX10-SDAG-NEXT: v_xor_b32_e32 v4, v0, v6
+; GFX10-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX10-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX10-SDAG-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-GISEL-NEXT: v_sub_f32_e32 v4, v0, v2
+; GFX10-GISEL-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; GFX10-GISEL-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-GISEL-NEXT: v_floor_f32_e32 v4, v4
+; GFX10-GISEL-NEXT: v_mul_f32_e64 v5, 0x2f800000, |v3|
+; GFX10-GISEL-NEXT: v_fma_f32 v2, 0xcf800000, v4, |v2|
+; GFX10-GISEL-NEXT: v_floor_f32_e32 v5, v5
+; GFX10-GISEL-NEXT: v_fma_f32 v0, 0xcf800000, v5, |v3|
+; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
+; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v4
+; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v4, v5
+; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, v2, v6
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v4, v4, v3
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v5, v0, v3
+; GFX10-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v6
+; GFX10-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX10-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v5, v3
+; GFX10-GISEL-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX11-SDAG-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v0, v0
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v1, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GFX11-SDAG-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v1|
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_floor_f32_e32 v2, v2
+; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; GFX11-SDAG-NEXT: v_floor_f32_e32 v3, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v0|
+; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX11-SDAG-NEXT: v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, v4
+; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX11-SDAG-NEXT: v_xor_b32_e32 v2, v2, v5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_xor_b32_e32 v3, v3, v6
+; GFX11-SDAG-NEXT: v_xor_b32_e32 v4, v0, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX11-GISEL-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX11-GISEL-NEXT: v_mul_f32_e64 v5, 0x2f800000, |v3|
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_floor_f32_e32 v4, v4
+; GFX11-GISEL-NEXT: v_floor_f32_e32 v5, v5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_fma_f32 v2, 0xcf800000, v4, |v2|
+; GFX11-GISEL-NEXT: v_fma_f32 v0, 0xcf800000, v5, |v3|
+; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v4
+; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v4, v5
+; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, v2, v6
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v4, v4, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v5, v0, v3
+; GFX11-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX11-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v5, v3
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %res = tail call <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
+ ret <2 x i64> %res
+}
More information about the llvm-commits
mailing list