[llvm] Scalarize the vector inputs to llvm.lround intrinsic by default. (PR #101054)
Sumanth Gundapaneni via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 1 10:53:22 PDT 2024
https://github.com/sgundapa updated https://github.com/llvm/llvm-project/pull/101054
>From 2169c487ff0dfd2375e221b1eb06d709c694cc41 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Mon, 29 Jul 2024 13:02:10 -0500
Subject: [PATCH 01/10] Scalarize the vector inputs to llvm.lround intrinsic by
default.
The verifier is updated in a different patch to allow vector types for the
llvm.lround and llvm.llround intrinsics.
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 6 +
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 2 +
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10 +-
.../SelectionDAG/LegalizeFloatTypes.cpp | 2 +
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 +-
.../SelectionDAG/LegalizeVectorOps.cpp | 2 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 16 +-
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +
llvm/lib/CodeGen/TargetLoweringBase.cpp | 5 +-
llvm/test/CodeGen/AMDGPU/lround.ll | 507 ++++++++++++++++++
10 files changed, 546 insertions(+), 8 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 5b9cc5dfeeadb..063840636b24e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2063,6 +2063,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
case Intrinsic::roundeven:
ISD = ISD::FROUNDEVEN;
break;
+ case Intrinsic::lround:
+ ISD = ISD::LROUND;
+ break;
+ case Intrinsic::llround:
+ ISD = ISD::LLROUND;
+ break;
case Intrinsic::pow:
ISD = ISD::FPOW;
break;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 225ec19246231..b4eb0bd0d0cd6 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4920,6 +4920,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_INTRINSIC_LLRINT:
case G_INTRINSIC_ROUND:
case G_INTRINSIC_ROUNDEVEN:
+ case G_LROUND:
+ case G_LLROUND:
case G_INTRINSIC_TRUNC:
case G_FCOS:
case G_FSIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 060e66175d965..d6b17e528e967 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -506,7 +506,7 @@ namespace {
SDValue visitUINT_TO_FP(SDNode *N);
SDValue visitFP_TO_SINT(SDNode *N);
SDValue visitFP_TO_UINT(SDNode *N);
- SDValue visitXRINT(SDNode *N);
+ SDValue visitXROUND(SDNode *N);
SDValue visitFP_ROUND(SDNode *N);
SDValue visitFP_EXTEND(SDNode *N);
SDValue visitFNEG(SDNode *N);
@@ -1925,8 +1925,10 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
- case ISD::LLRINT: return visitXRINT(N);
+ case ISD::LLRINT: return visitXROUND(N);
case ISD::FP_ROUND: return visitFP_ROUND(N);
case ISD::FP_EXTEND: return visitFP_EXTEND(N);
case ISD::FNEG: return visitFNEG(N);
@@ -17856,15 +17858,17 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
return FoldIntToFPToInt(N, DAG);
}
-SDValue DAGCombiner::visitXRINT(SDNode *N) {
+SDValue DAGCombiner::visitXROUND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
// fold (lrint|llrint undef) -> undef
+ // fold (lround|llround undef) -> undef
if (N0.isUndef())
return DAG.getUNDEF(VT);
// fold (lrint|llrint c1fp) -> c1
+ // fold (lround|llround c1fp) -> c1
if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 41fcc9afe4e90..2d0fad388cb7a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2409,6 +2409,8 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::FCOPYSIGN: R = PromoteFloatOp_FCOPYSIGN(N, OpNo); break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
case ISD::LLRINT: R = PromoteFloatOp_UnaryOp(N, OpNo); break;
case ISD::FP_TO_SINT_SAT:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index d4e61c8588901..d8d03ae04b669 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1046,7 +1046,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue WidenVecRes_Convert(SDNode *N);
SDValue WidenVecRes_Convert_StrictFP(SDNode *N);
SDValue WidenVecRes_FP_TO_XINT_SAT(SDNode *N);
- SDValue WidenVecRes_XRINT(SDNode *N);
+ SDValue WidenVecRes_XROUND(SDNode *N);
SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
SDValue WidenVecRes_UnarySameEltsWithScalarArg(SDNode *N);
SDValue WidenVecRes_ExpOp(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 57843f0959ac2..3f104baed97b1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -473,6 +473,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
Node->getValueType(0), Scale);
break;
}
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
case ISD::LLRINT:
case ISD::SINT_TO_FP:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5672b611234b8..b318f5164cf75 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -110,6 +110,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::LLRINT:
case ISD::FROUND:
case ISD::FROUNDEVEN:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::FSIN:
case ISD::FSINH:
case ISD::FSQRT:
@@ -750,6 +752,8 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::FP_TO_UINT:
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
case ISD::LLRINT:
Res = ScalarizeVecOp_UnaryOp(N);
@@ -1197,6 +1201,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::VP_FROUND:
case ISD::FROUNDEVEN:
case ISD::VP_FROUNDEVEN:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::FSIN:
case ISD::FSINH:
case ISD::FSQRT: case ISD::VP_SQRT:
@@ -3197,6 +3203,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::FTRUNC:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
case ISD::LLRINT:
Res = SplitVecOp_UnaryOp(N);
@@ -4519,7 +4527,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::LLRINT:
case ISD::VP_LRINT:
case ISD::VP_LLRINT:
- Res = WidenVecRes_XRINT(N);
+ case ISD::LROUND:
+ case ISD::LLROUND:
+ Res = WidenVecRes_XROUND(N);
break;
case ISD::FABS:
@@ -5136,7 +5146,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_FP_TO_XINT_SAT(SDNode *N) {
return DAG.getNode(N->getOpcode(), dl, WidenVT, Src, N->getOperand(1));
}
-SDValue DAGTypeLegalizer::WidenVecRes_XRINT(SDNode *N) {
+SDValue DAGTypeLegalizer::WidenVecRes_XROUND(SDNode *N) {
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
ElementCount WidenNumElts = WidenVT.getVectorElementCount();
@@ -6385,6 +6395,8 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VSELECT: Res = WidenVecOp_VSELECT(N); break;
case ISD::FLDEXP:
case ISD::FCOPYSIGN:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::LRINT:
case ISD::LLRINT:
Res = WidenVecOp_UnrollVectorOp(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bbc44a4716405..a87297129f7ec 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5425,6 +5425,8 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
case ISD::FCEIL:
case ISD::FROUND:
case ISD::FROUNDEVEN:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::FRINT:
case ISD::LRINT:
case ISD::LLRINT:
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 6ca9955993d24..51ffb0b5001a4 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -768,8 +768,9 @@ void TargetLoweringBase::initActions() {
setOperationAction(
{ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG,
ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
- ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::FTAN, ISD::FACOS,
- ISD::FASIN, ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH},
+ ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND,
+ ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN,
+ ISD::FCOSH, ISD::FSINH, ISD::FTANH},
VT, Expand);
// Constrained floating-point operations default to expand.
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index d45d83026013d..9210244f9108b 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -14,6 +14,9 @@ declare i64 @llvm.lround.i64.f64(double)
declare i64 @llvm.llround.i64.f32(float)
declare half @llvm.round.f16(half)
declare i32 @llvm.lround.i32.f16(half %arg)
+declare <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
+declare <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
+declare <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
define float @intrinsic_fround(float %arg) {
; GFX9-SDAG-LABEL: intrinsic_fround:
@@ -1034,3 +1037,507 @@ entry:
ret i32 %res
}
+define <2 x float> @intrinsic_fround_v2f32_v2f32(<2 x float> %arg) {
+; GFX9-SDAG-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT: s_brev_b32 s6, -2
+; GFX9-SDAG-NEXT: v_bfi_b32 v0, s6, v3, v0
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v1
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT: v_bfi_b32 v1, s6, v3, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2
+; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v4, v3
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v1
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, v4, v3
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2
+; GFX10-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-GISEL-NEXT: v_sub_f32_e32 v4, v0, v2
+; GFX10-GISEL-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX10-GISEL-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX11-SDAG-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX11-GISEL-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = tail call <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
+ ret <2 x float> %0
+}
+
+define <2 x i32> @intrinsic_lround_v2i32_v2f32(<2 x float> %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT: s_brev_b32 s6, -2
+; GFX9-SDAG-NEXT: v_bfi_b32 v0, s6, v3, v0
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v1
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT: v_bfi_b32 v1, s6, v3, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX9-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2
+; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v4, v3
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v1
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v1, v2
+; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, v4, v3
+; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2
+; GFX10-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX10-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-GISEL-NEXT: v_sub_f32_e32 v4, v0, v2
+; GFX10-GISEL-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX10-GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX11-SDAG-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX11-GISEL-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX11-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v1
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = tail call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
+ ret <2 x i32> %0
+}
+
+define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) {
+; GFX9-SDAG-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX9-SDAG: ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v0, v2
+; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT: s_brev_b32 s6, -2
+; GFX9-SDAG-NEXT: v_bfi_b32 v0, s6, v3, v0
+; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v0, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s7, 0x2f800000
+; GFX9-SDAG-NEXT: v_mul_f32_e64 v2, |v0|, s7
+; GFX9-SDAG-NEXT: v_floor_f32_e32 v2, v2
+; GFX9-SDAG-NEXT: s_mov_b32 s8, 0xcf800000
+; GFX9-SDAG-NEXT: v_fma_f32 v3, v2, s8, |v0|
+; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX9-SDAG-NEXT: v_ashrrev_i32_e32 v4, 31, v0
+; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v3, v4
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX9-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT: v_bfi_b32 v1, s6, v5, v1
+; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX9-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX9-SDAG-NEXT: v_mul_f32_e64 v1, |v3|, s7
+; GFX9-SDAG-NEXT: v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT: v_fma_f32 v5, v1, s8, |v3|
+; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GFX9-SDAG-NEXT: v_cvt_u32_f32_e32 v6, v1
+; GFX9-SDAG-NEXT: v_xor_b32_e32 v2, v2, v4
+; GFX9-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; GFX9-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v4, vcc
+; GFX9-SDAG-NEXT: v_xor_b32_e32 v2, v5, v3
+; GFX9-SDAG-NEXT: v_xor_b32_e32 v4, v6, v3
+; GFX9-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX9-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX9-GISEL: ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2
+; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v4, 1
+; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v4, v3
+; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x2f800000
+; GFX9-GISEL-NEXT: v_mul_f32_e64 v5, |v2|, v3
+; GFX9-GISEL-NEXT: v_floor_f32_e32 v5, v5
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v6, 0xcf800000
+; GFX9-GISEL-NEXT: v_fma_f32 v2, v5, v6, |v2|
+; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v0
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v2, v7
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, v5, v7
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v5, v1
+; GFX9-GISEL-NEXT: v_sub_f32_e32 v8, v1, v5
+; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, v4, v8
+; GFX9-GISEL-NEXT: v_add_f32_e32 v4, v5, v1
+; GFX9-GISEL-NEXT: v_trunc_f32_e32 v1, v4
+; GFX9-GISEL-NEXT: v_mul_f32_e64 v3, |v1|, v3
+; GFX9-GISEL-NEXT: v_floor_f32_e32 v3, v3
+; GFX9-GISEL-NEXT: v_fma_f32 v1, v3, v6, |v1|
+; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v5, v1
+; GFX9-GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX9-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v7
+; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; GFX9-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v7, vcc
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v2, v5, v4
+; GFX9-GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v4
+; GFX9-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX10-SDAG: ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2
+; GFX10-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v0, v0
+; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-SDAG-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT: v_trunc_f32_e32 v1, v1
+; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GFX10-SDAG-NEXT: v_floor_f32_e32 v2, v2
+; GFX10-SDAG-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v1|
+; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; GFX10-SDAG-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v0|
+; GFX10-SDAG-NEXT: v_floor_f32_e32 v3, v3
+; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX10-SDAG-NEXT: v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v1, v4
+; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX10-SDAG-NEXT: v_xor_b32_e32 v2, v2, v5
+; GFX10-SDAG-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX10-SDAG-NEXT: v_xor_b32_e32 v3, v3, v6
+; GFX10-SDAG-NEXT: v_xor_b32_e32 v4, v0, v6
+; GFX10-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX10-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX10-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX10-SDAG-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX10-GISEL: ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-GISEL-NEXT: v_sub_f32_e32 v4, v0, v2
+; GFX10-GISEL-NEXT: v_sub_f32_e32 v5, v1, v3
+; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX10-GISEL-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; GFX10-GISEL-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX10-GISEL-NEXT: v_floor_f32_e32 v4, v4
+; GFX10-GISEL-NEXT: v_mul_f32_e64 v5, 0x2f800000, |v3|
+; GFX10-GISEL-NEXT: v_fma_f32 v2, 0xcf800000, v4, |v2|
+; GFX10-GISEL-NEXT: v_floor_f32_e32 v5, v5
+; GFX10-GISEL-NEXT: v_fma_f32 v0, 0xcf800000, v5, |v3|
+; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
+; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v4
+; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v4, v5
+; GFX10-GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v2, v2, v6
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v4, v4, v3
+; GFX10-GISEL-NEXT: v_xor_b32_e32 v5, v0, v3
+; GFX10-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v6
+; GFX10-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX10-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v5, v3
+; GFX10-GISEL-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX11-SDAG: ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX11-SDAG-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v0, v0
+; GFX11-SDAG-NEXT: v_trunc_f32_e32 v1, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GFX11-SDAG-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v1|
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_floor_f32_e32 v2, v2
+; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; GFX11-SDAG-NEXT: v_floor_f32_e32 v3, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_fma_f32 v4, 0xcf800000, v2, |v0|
+; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GFX11-SDAG-NEXT: v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, v4
+; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX11-SDAG-NEXT: v_xor_b32_e32 v2, v2, v5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_xor_b32_e32 v3, v3, v6
+; GFX11-SDAG-NEXT: v_xor_b32_e32 v4, v0, v6
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX11-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX11-SDAG-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX11-GISEL: ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX11-GISEL-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0
+; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX11-GISEL-NEXT: v_mul_f32_e64 v5, 0x2f800000, |v3|
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_floor_f32_e32 v4, v4
+; GFX11-GISEL-NEXT: v_floor_f32_e32 v5, v5
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT: v_fma_f32 v2, 0xcf800000, v4, |v2|
+; GFX11-GISEL-NEXT: v_fma_f32 v0, 0xcf800000, v5, |v3|
+; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v4
+; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v4, v5
+; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v2, v2, v6
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v4, v4, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_xor_b32_e32 v5, v0, v3
+; GFX11-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v1, v6
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX11-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v5, v3
+; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = tail call <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
+ ret <2 x i64> %0
+}
>From 831e9d9173b6f20445552a93e8345b8cc62332b6 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Fri, 12 Jul 2024 14:10:48 -0500
Subject: [PATCH 02/10] llvm.lround: Update verifier to validate support of
vector types.
(cherry picked from commit 12ed1a477c451584f840978af1f34ba0c98d5215)
---
llvm/lib/CodeGen/MachineVerifier.cpp | 15 ++++-
llvm/lib/IR/Verifier.cpp | 17 +++++-
llvm/test/MachineVerifier/test_g_llround.mir | 16 +++--
llvm/test/MachineVerifier/test_g_lround.mir | 8 ++-
llvm/unittests/IR/IntrinsicsTest.cpp | 64 ++++++++++++++++++++
5 files changed, 109 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index d22fbe322ec36..6364ecea1897c 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2062,7 +2062,20 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
}
case TargetOpcode::G_LLROUND:
case TargetOpcode::G_LROUND: {
- verifyAllRegOpsScalar(*MI, *MRI);
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ if (!DstTy.isValid() || !SrcTy.isValid())
+ break;
+ if (SrcTy.isPointer() || DstTy.isPointer()) {
+ std::string Op = SrcTy.isPointer() ? "Source" : "Destination";
+ report(Twine(Op, " operand must not be a pointer type"), MI);
+ } else if (SrcTy.isScalar()) {
+ verifyAllRegOpsScalar(*MI, *MRI);
+ break;
+ } else if (SrcTy.isVector()) {
+ verifyVectorElementMatch(SrcTy, DstTy, MI);
+ break;
+ }
break;
}
case TargetOpcode::G_IS_FPCLASS: {
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c5c407637cbf3..45dda0d7c337c 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5971,8 +5971,21 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
case Intrinsic::llround: {
Type *ValTy = Call.getArgOperand(0)->getType();
Type *ResultTy = Call.getType();
- Check(!ValTy->isVectorTy() && !ResultTy->isVectorTy(),
- "Intrinsic does not support vectors", &Call);
+ Check(
+ ValTy->isFPOrFPVectorTy() && ResultTy->isIntOrIntVectorTy(),
+ "llvm.lround, llvm.llround: argument must be floating-point or vector "
+ "of floating-points, and result must be integer or vector of integers",
+ &Call);
+ Check(
+ ValTy->isVectorTy() == ResultTy->isVectorTy(),
+ "llvm.lround, llvm.llround: argument and result disagree on vector use",
+ &Call);
+ if (ValTy->isVectorTy()) {
+ Check(cast<VectorType>(ValTy)->getElementCount() ==
+ cast<VectorType>(ResultTy)->getElementCount(),
+ "llvm.lround, llvm.llround: argument must be same length as result",
+ &Call);
+ }
break;
}
case Intrinsic::bswap: {
diff --git a/llvm/test/MachineVerifier/test_g_llround.mir b/llvm/test/MachineVerifier/test_g_llround.mir
index 9a0f4a75acaf4..e69499b1150c1 100644
--- a/llvm/test/MachineVerifier/test_g_llround.mir
+++ b/llvm/test/MachineVerifier/test_g_llround.mir
@@ -14,10 +14,14 @@ body: |
%ptr:_(p0) = COPY $x0
%vector:_(<2 x s64>) = COPY $q0
- ; CHECK: Bad machine code: All register operands must have scalar types
- ; CHECK: instruction: %no_ptrs:_(s64) = G_LROUND %ptr:_(p0)
- %no_ptrs:_(s64) = G_LROUND %ptr:_(p0)
+ ; CHECK: Bad machine code: Source operand must not be a pointer type
+ ; CHECK: instruction: %no_ptrs:_(s32) = G_LLROUND %ptr:_(p0)
+ %no_ptrs:_(s32) = G_LLROUND %ptr:_(p0)
- ; CHECK: Bad machine code: All register operands must have scalar types
- ; CHECK: instruction: %no_vectors:_(s64) = G_LROUND %vector:_(<2 x s64>)
- %no_vectors:_(s64) = G_LROUND %vector:_(<2 x s64>)
+ ; CHECK: Bad machine code: operand types must be all-vector or all-scalar
+ ; CHECK: instruction: %no_vectors:_(s32) = G_LLROUND %vector:_(<2 x s64>)
+ %no_vectors:_(s32) = G_LLROUND %vector:_(<2 x s64>)
+
+ ; CHECK: Bad machine code: operand types must preserve number of vector elements
+ ; CHECK: instruction: %inv_vectors:_(<3 x s32>) = G_LLROUND %vector:_(<2 x s64>)
+ %inv_vectors:_(<3 x s32>) = G_LLROUND %vector:_(<2 x s64>)
diff --git a/llvm/test/MachineVerifier/test_g_lround.mir b/llvm/test/MachineVerifier/test_g_lround.mir
index 69d5d4967de30..56f06f00049e7 100644
--- a/llvm/test/MachineVerifier/test_g_lround.mir
+++ b/llvm/test/MachineVerifier/test_g_lround.mir
@@ -14,10 +14,14 @@ body: |
%ptr:_(p0) = COPY $x0
%vector:_(<2 x s64>) = COPY $q0
- ; CHECK: Bad machine code: All register operands must have scalar types
+ ; CHECK: Bad machine code: Source operand must not be a pointer type
; CHECK: instruction: %no_ptrs:_(s32) = G_LROUND %ptr:_(p0)
%no_ptrs:_(s32) = G_LROUND %ptr:_(p0)
- ; CHECK: Bad machine code: All register operands must have scalar types
+ ; CHECK: Bad machine code: operand types must be all-vector or all-scalar
; CHECK: instruction: %no_vectors:_(s32) = G_LROUND %vector:_(<2 x s64>)
%no_vectors:_(s32) = G_LROUND %vector:_(<2 x s64>)
+
+ ; CHECK: Bad machine code: operand types must preserve number of vector elements
+ ; CHECK: instruction: %inv_vectors:_(<3 x s32>) = G_LROUND %vector:_(<2 x s64>)
+ %inv_vectors:_(<3 x s32>) = G_LROUND %vector:_(<2 x s64>)
diff --git a/llvm/unittests/IR/IntrinsicsTest.cpp b/llvm/unittests/IR/IntrinsicsTest.cpp
index 6f9e724c40326..14badaa0de980 100644
--- a/llvm/unittests/IR/IntrinsicsTest.cpp
+++ b/llvm/unittests/IR/IntrinsicsTest.cpp
@@ -12,6 +12,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
#include "gtest/gtest.h"
using namespace llvm;
@@ -106,4 +107,67 @@ TEST_F(IntrinsicsTest, InstrProfInheritance) {
EXPECT_TRUE(Checker(*Intr));
}
}
+
+TEST(IntrinsicVerifierTest, LRound) {
+ LLVMContext C;
+ std::unique_ptr<Module> M = std::make_unique<Module>("M", C);
+ IRBuilder<> Builder(C);
+
+ using TypePair = std::pair<Type *, Type *>;
+ Type *Int32Ty = Type::getInt32Ty(C);
+ Type *Int64Ty = Type::getInt64Ty(C);
+ Type *HalfTy = Type::getHalfTy(C);
+ Type *FltTy = Type::getFloatTy(C);
+ Type *DblTy = Type::getDoubleTy(C);
+ auto Vec2xTy = [&](Type *ElemTy) {
+ return VectorType::get(ElemTy, ElementCount::getFixed(2));
+ };
+ Type *Vec2xInt32Ty = Vec2xTy(Int32Ty);
+ Type *Vec2xInt64Ty = Vec2xTy(Int64Ty);
+ Type *Vec2xFltTy = Vec2xTy(FltTy);
+
+ // Test Cases
+ // Validating only a limited set of possible combinations.
+ std::vector<TypePair> ValidTypes = {
+ {Int32Ty, FltTy}, {Int32Ty, DblTy}, {Int64Ty, FltTy},
+ {Int64Ty, DblTy}, {Int32Ty, HalfTy}, {Vec2xInt32Ty, Vec2xFltTy},
+ {Vec2xInt64Ty, Vec2xFltTy}};
+
+ // CreateIntrinsic errors out on invalid argument types.
+ std::vector<TypePair> InvalidTypes = {
+ {VectorType::get(Int32Ty, ElementCount::getFixed(3)), Vec2xFltTy}};
+
+ auto testIntrinsic = [&](TypePair types, Intrinsic::ID ID, bool expectValid) {
+ Function *F =
+ Function::Create(FunctionType::get(types.first, {types.second}, false),
+ Function::ExternalLinkage, "lround_fn", M.get());
+ BasicBlock *BB = BasicBlock::Create(C, "entry", F);
+ Builder.SetInsertPoint(BB);
+
+ Value *Arg = F->arg_begin();
+ Value *Result = Builder.CreateIntrinsic(types.first, ID, {Arg});
+ Builder.CreateRet(Result);
+
+ std::string Error;
+ raw_string_ostream ErrorOS(Error);
+ EXPECT_EQ(expectValid, !verifyFunction(*F, &ErrorOS));
+ if (!expectValid) {
+ EXPECT_TRUE(StringRef(ErrorOS.str())
+ .contains("llvm.lround, llvm.llround: argument must be "
+ "same length as result"));
+ }
+ };
+
+ // Run Valid Cases.
+ for (auto Types : ValidTypes) {
+ testIntrinsic(Types, Intrinsic::lround, true);
+ testIntrinsic(Types, Intrinsic::llround, true);
+ }
+
+ // Run Invalid Cases.
+ for (auto Types : InvalidTypes) {
+ testIntrinsic(Types, Intrinsic::lround, false);
+ testIntrinsic(Types, Intrinsic::llround, false);
+ }
+}
} // end namespace
>From 9b9dd207e27fde122b05072df25a20a4318bcdd2 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Tue, 16 Jul 2024 13:12:27 -0500
Subject: [PATCH 03/10] Update doc for llvm.lround
---
llvm/docs/LangRef.rst | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 474a86a02c2ed..98f8e750935cb 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -16588,7 +16588,8 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.lround`` on any
-floating-point type. Not all targets support all types however.
+floating-point type or vector of floating-point type. Not all targets
+support all types however.
::
>From 9a607052157b557a77999c76cb4b88fe822e4079 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Fri, 26 Jul 2024 12:17:44 -0500
Subject: [PATCH 04/10] Addressed reviewer's comment.
---
llvm/lib/CodeGen/MachineVerifier.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 6364ecea1897c..5e9bb4c27ffbd 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2067,7 +2067,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
if (!DstTy.isValid() || !SrcTy.isValid())
break;
if (SrcTy.isPointer() || DstTy.isPointer()) {
- std::string Op = SrcTy.isPointer() ? "Source" : "Destination";
+ StringRef Op = SrcTy.isPointer() ? "Source" : "Destination";
report(Twine(Op, " operand must not be a pointer type"), MI);
} else if (SrcTy.isScalar()) {
verifyAllRegOpsScalar(*MI, *MRI);
>From af54fde0c3f47dd25aaaf1875f295d599968fba8 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Tue, 30 Jul 2024 10:52:39 -0500
Subject: [PATCH 05/10] Updated the lit test.
---
llvm/test/CodeGen/AMDGPU/lround.ll | 204 -----------------------------
1 file changed, 204 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index 9210244f9108b..36a2280d2acd1 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -6,97 +6,6 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-GISEL %s
-declare float @llvm.round.f32(float)
-declare i32 @llvm.lround.i32.f32(float)
-declare i32 @llvm.lround.i32.f64(double)
-declare i64 @llvm.lround.i64.f32(float)
-declare i64 @llvm.lround.i64.f64(double)
-declare i64 @llvm.llround.i64.f32(float)
-declare half @llvm.round.f16(half)
-declare i32 @llvm.lround.i32.f16(half %arg)
-declare <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
-declare <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
-declare <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
-
-define float @intrinsic_fround(float %arg) {
-; GFX9-SDAG-LABEL: intrinsic_fround:
-; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_trunc_f32_e32 v1, v0
-; GFX9-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-SDAG-NEXT: s_brev_b32 s4, -2
-; GFX9-SDAG-NEXT: v_bfi_b32 v0, s4, v2, v0
-; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: intrinsic_fround:
-; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_trunc_f32_e32 v1, v0
-; GFX9-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v3, v2
-; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: intrinsic_fround:
-; GFX10-SDAG: ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_trunc_f32_e32 v1, v0
-; GFX10-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: intrinsic_fround:
-; GFX10-GISEL: ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_trunc_f32_e32 v1, v0
-; GFX10-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: intrinsic_fround:
-; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_trunc_f32_e32 v1, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: intrinsic_fround:
-; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_trunc_f32_e32 v1, v0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-entry:
- %res = tail call float @llvm.round.f32(float %arg)
- ret float %res
-}
-
define i32 @intrinsic_lround_i32_f32(float %arg) {
; GFX9-SDAG-LABEL: intrinsic_lround_i32_f32:
; GFX9-SDAG: ; %bb.0: ; %entry
@@ -1037,119 +946,6 @@ entry:
ret i32 %res
}
-define <2 x float> @intrinsic_fround_v2f32_v2f32(<2 x float> %arg) {
-; GFX9-SDAG-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v0
-; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v0, v2
-; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-SDAG-NEXT: s_brev_b32 s6, -2
-; GFX9-SDAG-NEXT: v_bfi_b32 v0, s6, v3, v0
-; GFX9-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
-; GFX9-SDAG-NEXT: v_trunc_f32_e32 v2, v1
-; GFX9-SDAG-NEXT: v_sub_f32_e32 v3, v1, v2
-; GFX9-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-SDAG-NEXT: v_bfi_b32 v1, s6, v3, v1
-; GFX9-SDAG-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v0
-; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v0, v2
-; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-GISEL-NEXT: v_and_or_b32 v0, v0, v4, v3
-; GFX9-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
-; GFX9-GISEL-NEXT: v_trunc_f32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_sub_f32_e32 v3, v1, v2
-; GFX9-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-GISEL-NEXT: v_and_or_b32 v1, v1, v4, v3
-; GFX9-GISEL-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX10-SDAG: ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_trunc_f32_e32 v2, v0
-; GFX10-SDAG-NEXT: v_trunc_f32_e32 v3, v1
-; GFX10-SDAG-NEXT: v_sub_f32_e32 v4, v0, v2
-; GFX10-SDAG-NEXT: v_sub_f32_e32 v5, v1, v3
-; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
-; GFX10-SDAG-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
-; GFX10-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
-; GFX10-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
-; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
-; GFX10-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX10-GISEL: ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_trunc_f32_e32 v2, v0
-; GFX10-GISEL-NEXT: v_trunc_f32_e32 v3, v1
-; GFX10-GISEL-NEXT: v_sub_f32_e32 v4, v0, v2
-; GFX10-GISEL-NEXT: v_sub_f32_e32 v5, v1, v3
-; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v4|, 0.5
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s4
-; GFX10-GISEL-NEXT: v_cmp_ge_f32_e64 s4, |v5|, 0.5
-; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s4
-; GFX10-GISEL-NEXT: v_add_f32_e32 v0, v2, v0
-; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
-; GFX10-GISEL-NEXT: v_add_f32_e32 v1, v3, v1
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_trunc_f32_e32 v2, v0
-; GFX11-SDAG-NEXT: v_trunc_f32_e32 v3, v1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
-; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
-; GFX11-SDAG-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_bfi_b32 v0, 0x7fffffff, v4, v0
-; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v1
-; GFX11-SDAG-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, v0
-; GFX11-GISEL-NEXT: v_trunc_f32_e32 v3, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
-; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v4|, 0.5
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s0
-; GFX11-GISEL-NEXT: v_cmp_ge_f32_e64 s0, |v5|, 0.5
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0x80000000, v0, v4
-; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_and_or_b32 v1, 0x80000000, v1, v5
-; GFX11-GISEL-NEXT: v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-entry:
- %0 = tail call <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
- ret <2 x float> %0
-}
-
define <2 x i32> @intrinsic_lround_v2i32_v2f32(<2 x float> %arg) {
; GFX9-SDAG-LABEL: intrinsic_lround_v2i32_v2f32:
; GFX9-SDAG: ; %bb.0: ; %entry
>From cc37d4e6fbc8806fb39a3d2b9da46436d9ec90c6 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Tue, 30 Jul 2024 12:51:03 -0500
Subject: [PATCH 06/10] Revert "Addressed reviewer's comment."
This reverts commit 9a607052157b557a77999c76cb4b88fe822e4079.
---
llvm/lib/CodeGen/MachineVerifier.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 5e9bb4c27ffbd..6364ecea1897c 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2067,7 +2067,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
if (!DstTy.isValid() || !SrcTy.isValid())
break;
if (SrcTy.isPointer() || DstTy.isPointer()) {
- StringRef Op = SrcTy.isPointer() ? "Source" : "Destination";
+ std::string Op = SrcTy.isPointer() ? "Source" : "Destination";
report(Twine(Op, " operand must not be a pointer type"), MI);
} else if (SrcTy.isScalar()) {
verifyAllRegOpsScalar(*MI, *MRI);
>From 9a26c0f390b5f372a619b6eb3d5d7d567de50ee0 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Tue, 30 Jul 2024 12:51:08 -0500
Subject: [PATCH 07/10] Revert "Update doc for llvm.lround"
This reverts commit 9b9dd207e27fde122b05072df25a20a4318bcdd2.
---
llvm/docs/LangRef.rst | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 98f8e750935cb..474a86a02c2ed 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -16588,8 +16588,7 @@ Syntax:
"""""""
This is an overloaded intrinsic. You can use ``llvm.lround`` on any
-floating-point type or vector of floating-point type. Not all targets
-support all types however.
+floating-point type. Not all targets support all types however.
::
>From a16fc240f3c05c7a1085e7a89b862c04118f29ed Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Tue, 30 Jul 2024 12:52:42 -0500
Subject: [PATCH 08/10] Revert "llvm.lround: Update verifier to validate
support of vector types."
This reverts commit 831e9d9173b6f20445552a93e8345b8cc62332b6.
---
llvm/lib/CodeGen/MachineVerifier.cpp | 15 +----
llvm/lib/IR/Verifier.cpp | 17 +-----
llvm/test/MachineVerifier/test_g_llround.mir | 16 ++---
llvm/test/MachineVerifier/test_g_lround.mir | 8 +--
llvm/unittests/IR/IntrinsicsTest.cpp | 64 --------------------
5 files changed, 11 insertions(+), 109 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 6364ecea1897c..d22fbe322ec36 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2062,20 +2062,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
}
case TargetOpcode::G_LLROUND:
case TargetOpcode::G_LROUND: {
- LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
- LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
- if (!DstTy.isValid() || !SrcTy.isValid())
- break;
- if (SrcTy.isPointer() || DstTy.isPointer()) {
- std::string Op = SrcTy.isPointer() ? "Source" : "Destination";
- report(Twine(Op, " operand must not be a pointer type"), MI);
- } else if (SrcTy.isScalar()) {
- verifyAllRegOpsScalar(*MI, *MRI);
- break;
- } else if (SrcTy.isVector()) {
- verifyVectorElementMatch(SrcTy, DstTy, MI);
- break;
- }
+ verifyAllRegOpsScalar(*MI, *MRI);
break;
}
case TargetOpcode::G_IS_FPCLASS: {
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 45dda0d7c337c..c5c407637cbf3 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5971,21 +5971,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
case Intrinsic::llround: {
Type *ValTy = Call.getArgOperand(0)->getType();
Type *ResultTy = Call.getType();
- Check(
- ValTy->isFPOrFPVectorTy() && ResultTy->isIntOrIntVectorTy(),
- "llvm.lround, llvm.llround: argument must be floating-point or vector "
- "of floating-points, and result must be integer or vector of integers",
- &Call);
- Check(
- ValTy->isVectorTy() == ResultTy->isVectorTy(),
- "llvm.lround, llvm.llround: argument and result disagree on vector use",
- &Call);
- if (ValTy->isVectorTy()) {
- Check(cast<VectorType>(ValTy)->getElementCount() ==
- cast<VectorType>(ResultTy)->getElementCount(),
- "llvm.lround, llvm.llround: argument must be same length as result",
- &Call);
- }
+ Check(!ValTy->isVectorTy() && !ResultTy->isVectorTy(),
+ "Intrinsic does not support vectors", &Call);
break;
}
case Intrinsic::bswap: {
diff --git a/llvm/test/MachineVerifier/test_g_llround.mir b/llvm/test/MachineVerifier/test_g_llround.mir
index e69499b1150c1..9a0f4a75acaf4 100644
--- a/llvm/test/MachineVerifier/test_g_llround.mir
+++ b/llvm/test/MachineVerifier/test_g_llround.mir
@@ -14,14 +14,10 @@ body: |
%ptr:_(p0) = COPY $x0
%vector:_(<2 x s64>) = COPY $q0
- ; CHECK: Bad machine code: Source operand must not be a pointer type
- ; CHECK: instruction: %no_ptrs:_(s32) = G_LLROUND %ptr:_(p0)
- %no_ptrs:_(s32) = G_LLROUND %ptr:_(p0)
+ ; CHECK: Bad machine code: All register operands must have scalar types
+ ; CHECK: instruction: %no_ptrs:_(s64) = G_LROUND %ptr:_(p0)
+ %no_ptrs:_(s64) = G_LROUND %ptr:_(p0)
- ; CHECK: Bad machine code: operand types must be all-vector or all-scalar
- ; CHECK: instruction: %no_vectors:_(s32) = G_LLROUND %vector:_(<2 x s64>)
- %no_vectors:_(s32) = G_LLROUND %vector:_(<2 x s64>)
-
- ; CHECK: Bad machine code: operand types must preserve number of vector elements
- ; CHECK: instruction: %inv_vectors:_(<3 x s32>) = G_LLROUND %vector:_(<2 x s64>)
- %inv_vectors:_(<3 x s32>) = G_LLROUND %vector:_(<2 x s64>)
+ ; CHECK: Bad machine code: All register operands must have scalar types
+ ; CHECK: instruction: %no_vectors:_(s64) = G_LROUND %vector:_(<2 x s64>)
+ %no_vectors:_(s64) = G_LROUND %vector:_(<2 x s64>)
diff --git a/llvm/test/MachineVerifier/test_g_lround.mir b/llvm/test/MachineVerifier/test_g_lround.mir
index 56f06f00049e7..69d5d4967de30 100644
--- a/llvm/test/MachineVerifier/test_g_lround.mir
+++ b/llvm/test/MachineVerifier/test_g_lround.mir
@@ -14,14 +14,10 @@ body: |
%ptr:_(p0) = COPY $x0
%vector:_(<2 x s64>) = COPY $q0
- ; CHECK: Bad machine code: Source operand must not be a pointer type
+ ; CHECK: Bad machine code: All register operands must have scalar types
; CHECK: instruction: %no_ptrs:_(s32) = G_LROUND %ptr:_(p0)
%no_ptrs:_(s32) = G_LROUND %ptr:_(p0)
- ; CHECK: Bad machine code: operand types must be all-vector or all-scalar
+ ; CHECK: Bad machine code: All register operands must have scalar types
; CHECK: instruction: %no_vectors:_(s32) = G_LROUND %vector:_(<2 x s64>)
%no_vectors:_(s32) = G_LROUND %vector:_(<2 x s64>)
-
- ; CHECK: Bad machine code: operand types must preserve number of vector elements
- ; CHECK: instruction: %inv_vectors:_(<3 x s32>) = G_LROUND %vector:_(<2 x s64>)
- %inv_vectors:_(<3 x s32>) = G_LROUND %vector:_(<2 x s64>)
diff --git a/llvm/unittests/IR/IntrinsicsTest.cpp b/llvm/unittests/IR/IntrinsicsTest.cpp
index 14badaa0de980..6f9e724c40326 100644
--- a/llvm/unittests/IR/IntrinsicsTest.cpp
+++ b/llvm/unittests/IR/IntrinsicsTest.cpp
@@ -12,7 +12,6 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/Verifier.h"
#include "gtest/gtest.h"
using namespace llvm;
@@ -107,67 +106,4 @@ TEST_F(IntrinsicsTest, InstrProfInheritance) {
EXPECT_TRUE(Checker(*Intr));
}
}
-
-TEST(IntrinsicVerifierTest, LRound) {
- LLVMContext C;
- std::unique_ptr<Module> M = std::make_unique<Module>("M", C);
- IRBuilder<> Builder(C);
-
- using TypePair = std::pair<Type *, Type *>;
- Type *Int32Ty = Type::getInt32Ty(C);
- Type *Int64Ty = Type::getInt64Ty(C);
- Type *HalfTy = Type::getHalfTy(C);
- Type *FltTy = Type::getFloatTy(C);
- Type *DblTy = Type::getDoubleTy(C);
- auto Vec2xTy = [&](Type *ElemTy) {
- return VectorType::get(ElemTy, ElementCount::getFixed(2));
- };
- Type *Vec2xInt32Ty = Vec2xTy(Int32Ty);
- Type *Vec2xInt64Ty = Vec2xTy(Int64Ty);
- Type *Vec2xFltTy = Vec2xTy(FltTy);
-
- // Test Cases
- // Validating only a limited set of possible combinations.
- std::vector<TypePair> ValidTypes = {
- {Int32Ty, FltTy}, {Int32Ty, DblTy}, {Int64Ty, FltTy},
- {Int64Ty, DblTy}, {Int32Ty, HalfTy}, {Vec2xInt32Ty, Vec2xFltTy},
- {Vec2xInt64Ty, Vec2xFltTy}};
-
- // CreateIntrinsic errors out on invalid argument types.
- std::vector<TypePair> InvalidTypes = {
- {VectorType::get(Int32Ty, ElementCount::getFixed(3)), Vec2xFltTy}};
-
- auto testIntrinsic = [&](TypePair types, Intrinsic::ID ID, bool expectValid) {
- Function *F =
- Function::Create(FunctionType::get(types.first, {types.second}, false),
- Function::ExternalLinkage, "lround_fn", M.get());
- BasicBlock *BB = BasicBlock::Create(C, "entry", F);
- Builder.SetInsertPoint(BB);
-
- Value *Arg = F->arg_begin();
- Value *Result = Builder.CreateIntrinsic(types.first, ID, {Arg});
- Builder.CreateRet(Result);
-
- std::string Error;
- raw_string_ostream ErrorOS(Error);
- EXPECT_EQ(expectValid, !verifyFunction(*F, &ErrorOS));
- if (!expectValid) {
- EXPECT_TRUE(StringRef(ErrorOS.str())
- .contains("llvm.lround, llvm.llround: argument must be "
- "same length as result"));
- }
- };
-
- // Run Valid Cases.
- for (auto Types : ValidTypes) {
- testIntrinsic(Types, Intrinsic::lround, true);
- testIntrinsic(Types, Intrinsic::llround, true);
- }
-
- // Run Invalid Cases.
- for (auto Types : InvalidTypes) {
- testIntrinsic(Types, Intrinsic::lround, false);
- testIntrinsic(Types, Intrinsic::llround, false);
- }
-}
} // end namespace
>From d83d017d8141588bf633b8e9346285867e5e47e1 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Thu, 1 Aug 2024 12:45:25 -0500
Subject: [PATCH 09/10] Remove TTI cost computation case as per comment.
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 6 ------
1 file changed, 6 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 063840636b24e..5b9cc5dfeeadb 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2063,12 +2063,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
case Intrinsic::roundeven:
ISD = ISD::FROUNDEVEN;
break;
- case Intrinsic::lround:
- ISD = ISD::LROUND;
- break;
- case Intrinsic::llround:
- ISD = ISD::LLROUND;
- break;
case Intrinsic::pow:
ISD = ISD::FPOW;
break;
>From 610a83e41449f6f81b21d5902a225769c19bbc07 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Thu, 1 Aug 2024 12:52:52 -0500
Subject: [PATCH 10/10] Update variable name in lit test
---
llvm/test/CodeGen/AMDGPU/lround.ll | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index 36a2280d2acd1..072ee70b840d8 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -1069,8 +1069,8 @@ define <2 x i32> @intrinsic_lround_v2i32_v2f32(<2 x float> %arg) {
; GFX11-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
- %0 = tail call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
- ret <2 x i32> %0
+ %res = tail call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
+ ret <2 x i32> %res
}
define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) {
@@ -1334,6 +1334,6 @@ define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) {
; GFX11-GISEL-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
- %0 = tail call <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
- ret <2 x i64> %0
+ %res = tail call <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
+ ret <2 x i64> %res
}
More information about the llvm-commits
mailing list