[llvm] [AMDGPU] Implement llvm.lround intrinsic lowering. (PR #98970)

Sumanth Gundapaneni via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 23 08:31:08 PDT 2024


https://github.com/sgundapa updated https://github.com/llvm/llvm-project/pull/98970

>From 9fe46ed6acd3dcd6e61452e0753f60067682f94b Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Wed, 26 Jun 2024 10:58:20 -0500
Subject: [PATCH 1/4] [AMDGPU] Implement llvm.lround intrinsic lowering.

This patch enables the target-independent lowering of llvm.lround via
GlobalISel. For SelectionDAG, the intrinsic is custom lowered for AMDGPU.
In order to support vector floating-point input for llvm.lround, this patch
extends the target-independent APIs and provides support for scalarizing.
pr98950 is needed to let the verifier allow vector floating-point types.
---
 llvm/docs/LangRef.rst                         |   3 +-
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |   6 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  13 +
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  10 +-
 .../SelectionDAG/LegalizeFloatTypes.cpp       |   4 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   2 +-
 .../SelectionDAG/LegalizeVectorOps.cpp        |   2 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  16 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |   2 +
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |   3 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  15 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   1 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   5 +
 llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll | 814 ++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/lround.ll            | 807 +++++++++++++++++
 15 files changed, 1692 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/lround.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index a04b5769f095f..1fa23fd88792f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -16606,7 +16606,8 @@ Syntax:
 """""""
 
 This is an overloaded intrinsic. You can use ``llvm.lround`` on any
-floating-point type. Not all targets support all types however.
+floating-point type or vector of floating-point type. Not all targets
+support all types however.
 
 ::
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 5b9cc5dfeeadb..4cc8a5e726fa2 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2057,6 +2057,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     case Intrinsic::llrint:
       ISD = ISD::LLRINT;
       break;
+    case Intrinsic::lround:
+      ISD = ISD::LROUND;
+      break;
+    case Intrinsic::llround:
+      ISD = ISD::LLROUND;
+      break;
     case Intrinsic::round:
       ISD = ISD::FROUND;
       break;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 83df106a7fdc8..e64b5ba65cfad 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3881,6 +3881,17 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
     return lowerFMad(MI);
   case TargetOpcode::G_FFLOOR:
     return lowerFFloor(MI);
+  case TargetOpcode::G_LROUND:
+  case TargetOpcode::G_LLROUND: {
+    Register DstReg = MI.getOperand(0).getReg();
+    Register SrcReg = MI.getOperand(1).getReg();
+    LLT SrcTy = MRI.getType(SrcReg);
+    auto Round = MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_ROUND, {SrcTy},
+                                       {SrcReg});
+    MIRBuilder.buildFPTOSI(DstReg, Round);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   case TargetOpcode::G_INTRINSIC_ROUND:
     return lowerIntrinsicRound(MI);
   case TargetOpcode::G_FRINT: {
@@ -4741,6 +4752,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case G_FRINT:
   case G_INTRINSIC_ROUND:
   case G_INTRINSIC_ROUNDEVEN:
+  case G_LROUND:
+  case G_LLROUND:
   case G_INTRINSIC_TRUNC:
   case G_FCOS:
   case G_FSIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 765f1e1f5f68c..14876c60e318c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -506,7 +506,7 @@ namespace {
     SDValue visitUINT_TO_FP(SDNode *N);
     SDValue visitFP_TO_SINT(SDNode *N);
     SDValue visitFP_TO_UINT(SDNode *N);
-    SDValue visitXRINT(SDNode *N);
+    SDValue visitXROUND(SDNode *N);
     SDValue visitFP_ROUND(SDNode *N);
     SDValue visitFP_EXTEND(SDNode *N);
     SDValue visitFNEG(SDNode *N);
@@ -1925,7 +1925,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::FP_TO_SINT:         return visitFP_TO_SINT(N);
   case ISD::FP_TO_UINT:         return visitFP_TO_UINT(N);
   case ISD::LRINT:
-  case ISD::LLRINT:             return visitXRINT(N);
+  case ISD::LLRINT:
+  case ISD::LROUND:
+  case ISD::LLROUND:            return visitXROUND(N);
   case ISD::FP_ROUND:           return visitFP_ROUND(N);
   case ISD::FP_EXTEND:          return visitFP_EXTEND(N);
   case ISD::FNEG:               return visitFNEG(N);
@@ -17806,15 +17808,17 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
   return FoldIntToFPToInt(N, DAG);
 }
 
-SDValue DAGCombiner::visitXRINT(SDNode *N) {
+SDValue DAGCombiner::visitXROUND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
   // fold (lrint|llrint undef) -> undef
+  // fold (lround|llround undef) -> undef
   if (N0.isUndef())
     return DAG.getUNDEF(VT);
 
   // fold (lrint|llrint c1fp) -> c1
+  // fold (lround|llround c1fp) -> c1
   if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
     return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0);
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 41fcc9afe4e90..7da3bfa9b0f38 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2410,7 +2410,9 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
     case ISD::FP_TO_SINT:
     case ISD::FP_TO_UINT:
     case ISD::LRINT:
-    case ISD::LLRINT:     R = PromoteFloatOp_UnaryOp(N, OpNo); break;
+    case ISD::LLRINT:
+    case ISD::LROUND:
+    case ISD::LLROUND:     R = PromoteFloatOp_UnaryOp(N, OpNo); break;
     case ISD::FP_TO_SINT_SAT:
     case ISD::FP_TO_UINT_SAT:
                           R = PromoteFloatOp_FP_TO_XINT_SAT(N, OpNo); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 7af47ed250d91..7dfc0916dd79f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1041,7 +1041,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue WidenVecRes_Convert(SDNode *N);
   SDValue WidenVecRes_Convert_StrictFP(SDNode *N);
   SDValue WidenVecRes_FP_TO_XINT_SAT(SDNode *N);
-  SDValue WidenVecRes_XRINT(SDNode *N);
+  SDValue WidenVecRes_XROUND(SDNode *N);
   SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
   SDValue WidenVecRes_UnarySameEltsWithScalarArg(SDNode *N);
   SDValue WidenVecRes_ExpOp(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 307d1fc920d48..7960020c33660 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -472,6 +472,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
                                               Node->getValueType(0), Scale);
     break;
   }
+  case ISD::LROUND:
+  case ISD::LLROUND:
   case ISD::LRINT:
   case ISD::LLRINT:
   case ISD::SINT_TO_FP:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1a575abbc16f4..d75d0b1a497f1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -108,6 +108,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FRINT:
   case ISD::LRINT:
   case ISD::LLRINT:
+  case ISD::LROUND:
+  case ISD::LLROUND:
   case ISD::FROUND:
   case ISD::FROUNDEVEN:
   case ISD::FSIN:
@@ -752,6 +754,8 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::UINT_TO_FP:
   case ISD::LRINT:
   case ISD::LLRINT:
+  case ISD::LROUND:
+  case ISD::LLROUND:
     Res = ScalarizeVecOp_UnaryOp(N);
     break;
   case ISD::STRICT_SINT_TO_FP:
@@ -1189,6 +1193,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::VP_LRINT:
   case ISD::LLRINT:
   case ISD::VP_LLRINT:
+  case ISD::LROUND:
+  case ISD::LLROUND:
   case ISD::FROUND:
   case ISD::VP_FROUND:
   case ISD::FROUNDEVEN:
@@ -3172,6 +3178,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:
   case ISD::FTRUNC:
+  case ISD::LROUND:
+  case ISD::LLROUND:
   case ISD::LRINT:
   case ISD::LLRINT:
     Res = SplitVecOp_UnaryOp(N);
@@ -4486,11 +4494,13 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     Res = WidenVecRes_FP_TO_XINT_SAT(N);
     break;
 
+  case ISD::LROUND:
+  case ISD::LLROUND:
   case ISD::LRINT:
   case ISD::LLRINT:
   case ISD::VP_LRINT:
   case ISD::VP_LLRINT:
-    Res = WidenVecRes_XRINT(N);
+    Res = WidenVecRes_XROUND(N);
     break;
 
   case ISD::FABS:
@@ -5107,7 +5117,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_FP_TO_XINT_SAT(SDNode *N) {
   return DAG.getNode(N->getOpcode(), dl, WidenVT, Src, N->getOperand(1));
 }
 
-SDValue DAGTypeLegalizer::WidenVecRes_XRINT(SDNode *N) {
+SDValue DAGTypeLegalizer::WidenVecRes_XROUND(SDNode *N) {
   SDLoc dl(N);
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   ElementCount WidenNumElts = WidenVT.getVectorElementCount();
@@ -6336,6 +6346,8 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::VSELECT:            Res = WidenVecOp_VSELECT(N); break;
   case ISD::FLDEXP:
   case ISD::FCOPYSIGN:
+  case ISD::LROUND:
+  case ISD::LLROUND:
   case ISD::LRINT:
   case ISD::LLRINT:
     Res = WidenVecOp_UnrollVectorOp(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 897bdc71818f8..01dcd6612ca30 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5420,6 +5420,8 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
   case ISD::FROUND:
   case ISD::FROUNDEVEN:
   case ISD::FRINT:
+  case ISD::LROUND:
+  case ISD::LLROUND:
   case ISD::LRINT:
   case ISD::LLRINT:
   case ISD::FNEARBYINT:
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index eccac0e218c58..1abfe14e6c539 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1115,7 +1115,8 @@ void TargetLoweringBase::initActions() {
           {ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG,
            ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
            ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::FTAN, ISD::FACOS,
-           ISD::FASIN, ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH},
+           ISD::FASIN, ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH,
+           ISD::LROUND, ISD::LLROUND},
           VT, Expand);
 
       // Constrained floating-point operations default to expand.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 26426575aeed3..d5b9e03d69aba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -396,7 +396,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                      MVT::f32, Legal);
 
   setOperationAction(ISD::FLOG2, MVT::f32, Custom);
-  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
+  setOperationAction({ISD::FROUND, ISD::LROUND, ISD::LLROUND},
+                     {MVT::f16, MVT::f32, MVT::f64}, Custom);
 
   setOperationAction(
       {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
@@ -1386,6 +1387,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   case ISD::FROUNDEVEN:
     return LowerFROUNDEVEN(Op, DAG);
   case ISD::FROUND: return LowerFROUND(Op, DAG);
+  case ISD::LROUND:
+  case ISD::LLROUND:
+    return LowerLROUND(Op, DAG);
   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
   case ISD::FLOG2:
     return LowerFLOG2(Op, DAG);
@@ -2498,7 +2502,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue X = Op.getOperand(0);
-  EVT VT = Op.getValueType();
+  EVT VT = X.getValueType();
 
   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
 
@@ -2522,6 +2526,13 @@ SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
 }
 
+SDValue AMDGPUTargetLowering::LowerLROUND(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  EVT ResVT = Op.getValueType();
+  SDValue FRoundNode = LowerFROUND(Op, DAG);
+  return DAG.getNode(ISD::FP_TO_SINT, SL, ResVT, FRoundNode);
+}
+
 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 37572af3897f2..eb7950c1c75fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -59,6 +59,7 @@ class AMDGPUTargetLowering : public TargetLowering {
 
   SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
 
   static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 88e40da110555..4c9ab3b040d13 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1131,6 +1131,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
        .scalarize(0)
        .lower();
 
+  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
+      .clampScalar(0, S16, S64)
+      .scalarize(0)
+      .lower();
+
   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
       .customFor({S16, S32})
       .scalarize(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll
new file mode 100644
index 0000000000000..fd4ebfdf0cc28
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll
@@ -0,0 +1,814 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+
+declare float @llvm.round.f32(float)
+declare i32 @llvm.lround.i32.f32(float)
+declare i32 @llvm.lround.i32.f64(double)
+declare i64 @llvm.lround.i64.f32(float)
+declare i64 @llvm.lround.i64.f64(double)
+declare i64 @llvm.llround.i64.f32(float)
+declare half @llvm.round.f16(half)
+declare i32 @llvm.lround.i32.f16(half %arg)
+declare <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
+declare <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
+declare <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
+
+define float @intrinsic_fround(float %arg) {
+; GFX9-LABEL: intrinsic_fround:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_fround:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_fround:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call float @llvm.round.f32(float %arg)
+  ret float %0
+}
+
+define i32 @intrinsic_lround_i32_f32(float %arg) {
+; GFX9-LABEL: intrinsic_lround_i32_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i32_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i32_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.lround.i32.f32(float %arg)
+  ret i32 %0
+}
+
+define i32 @intrinsic_lround_i32_f64(double %arg) {
+; GFX9-LABEL: intrinsic_lround_i32_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-NEXT:    s_brev_b32 s4, 1
+; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i32_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i32_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.lround.i32.f64(double %arg)
+  ret i32 %0
+}
+
+define i64 @intrinsic_lround_i64_f32(float %arg) {
+; GFX9-LABEL: intrinsic_lround_i64_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i64_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i64_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i64 @llvm.lround.i64.f32(float %arg)
+  ret i64 %0
+}
+
+define i64 @intrinsic_lround_i64_f64(double %arg) {
+; GFX9-LABEL: intrinsic_lround_i64_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-NEXT:    s_brev_b32 s4, 1
+; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i64_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i64_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i64 @llvm.lround.i64.f64(double %arg)
+  ret i64 %0
+}
+
+define i64 @intrinsic_llround_i64_f32(float %arg) {
+; GFX9-LABEL: intrinsic_llround_i64_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_llround_i64_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_llround_i64_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i64 @llvm.llround.i64.f32(float %arg)
+  ret i64 %0
+}
+
+define i64 @intrinsic_llround_i64_f64(double %arg) { ; Test: llvm.llround on a scalar double operand producing an i64 result.
+; GFX9-LABEL: intrinsic_llround_i64_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-NEXT:    s_brev_b32 s4, 1
+; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_llround_i64_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_llround_i64_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i64 @llvm.llround.i64.f64(double %arg) ; intrinsic under test; its result is returned unchanged
+  ret i64 %0
+}
+
+define half @intrinsic_fround_half(half %arg) { ; Test: llvm.round on a scalar half (floating-point result, no integer conversion).
+; GFX9-LABEL: intrinsic_fround_half:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_fround_half:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_fround_half:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call half @llvm.round.f16(half %arg) ; intrinsic under test; its result is returned unchanged
+  ret half %0
+}
+
+define i32 @intrinsic_lround_i32_f16(half %arg) { ; Test: llvm.lround on a scalar half operand producing an i32 result.
+; GFX9-LABEL: intrinsic_lround_i32_f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i32_f16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i32_f16:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.lround.i32.f16(half %arg) ; intrinsic under test; every check prefix above ends with v_cvt_f32_f16 followed by v_cvt_i32_f32
+  ret i32 %0
+}
+
+define <2 x float> @intrinsic_fround_v2f32_v2f32(<2 x float> %arg) { ; Test: llvm.round on a <2 x float> vector (floating-point result, no integer conversion).
+; GFX9-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
+; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v3
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg) ; intrinsic under test; its result is returned unchanged
+  ret <2 x float> %0
+}
+
+define <2 x i32> @intrinsic_lround_v2i32_v2f32(<2 x float> %arg) { ; Test: llvm.lround on a <2 x float> operand producing a <2 x i32> result.
+; GFX9-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
+; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v3
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg) ; intrinsic under test; the checks above show one v_cvt_i32_f32 per vector element
+  ret <2 x i32> %0
+}
+
+define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) { ; Test: llvm.lround on a <2 x float> operand producing a <2 x i64> result.
+; GFX9-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v3
+; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v5, |v2|, v3
+; GFX9-NEXT:    v_floor_f32_e32 v5, v5
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v2, v5, v6, |v2|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v7
+; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v7
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v1
+; GFX9-NEXT:    v_sub_f32_e32 v8, v1, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v8
+; GFX9-NEXT:    v_add_f32_e32 v4, v5, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v4
+; GFX9-NEXT:    v_mul_f32_e64 v3, |v1|, v3
+; GFX9-NEXT:    v_floor_f32_e32 v3, v3
+; GFX9-NEXT:    v_fma_f32 v1, v3, v6, |v1|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v7
+; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v7, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v4
+; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX10-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX10-NEXT:    v_floor_f32_e32 v4, v4
+; GFX10-NEXT:    v_mul_f32_e64 v5, 0x2f800000, |v3|
+; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v4, |v2|
+; GFX10-NEXT:    v_floor_f32_e32 v5, v5
+; GFX10-NEXT:    v_fma_f32 v0, 0xcf800000, v5, |v3|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v4
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v5
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX10-NEXT:    v_xor_b32_e32 v4, v4, v3
+; GFX10-NEXT:    v_xor_b32_e32 v5, v0, v3
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v6
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, v3
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
+; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v2|
+; GFX11-NEXT:    v_mul_f32_e64 v5, 0x2f800000, |v3|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_floor_f32_e32 v4, v4
+; GFX11-NEXT:    v_floor_f32_e32 v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v4, |v2|
+; GFX11-NEXT:    v_fma_f32 v0, 0xcf800000, v5, |v3|
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v4
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v4, v5
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX11-NEXT:    v_xor_b32_e32 v4, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_xor_b32_e32 v5, v0, v3
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, v3
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg) ; intrinsic under test; each i64 element is returned in a pair of 32-bit registers (v[0:1], v[2:3]) in the checks above
+  ret <2 x i64> %0
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
new file mode 100644
index 0000000000000..a20dbd91fb29e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -0,0 +1,807 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+
+declare float @llvm.round.f32(float)
+declare i32 @llvm.lround.i32.f32(float)
+declare i32 @llvm.lround.i32.f64(double)
+declare i64 @llvm.lround.i64.f32(float)
+declare i64 @llvm.lround.i64.f64(double)
+declare i64 @llvm.llround.i64.f32(float)
+declare half @llvm.round.f16(half)
+declare i32 @llvm.lround.i32.f16(half %arg)
+declare <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
+declare <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
+declare <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
+
+define float @intrinsic_fround(float %arg) { ; Test: llvm.round on a scalar float (floating-point result, no integer conversion).
+; GFX9-LABEL: intrinsic_fround:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_fround:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_fround:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call float @llvm.round.f32(float %arg) ; intrinsic under test; its result is returned unchanged
+  ret float %0
+}
+
+define i32 @intrinsic_lround_i32_f32(float %arg) { ; Test: llvm.lround on a scalar float operand producing an i32 result.
+; GFX9-LABEL: intrinsic_lround_i32_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i32_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i32_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.lround.i32.f32(float %arg) ; intrinsic under test; every check prefix above ends with a v_cvt_i32_f32 before the return
+  ret i32 %0
+}
+
+define i32 @intrinsic_lround_i32_f64(double %arg) { ; Test: llvm.lround on a scalar double operand producing an i32 result.
+; GFX9-LABEL: intrinsic_lround_i32_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
+; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i32_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 0x3ff00000, s4
+; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i32_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 0x3ff00000, s0
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.lround.i32.f64(double %arg) ; intrinsic under test; every check prefix above ends with a v_cvt_i32_f64 before the return
+  ret i32 %0
+}
+
+define i64 @intrinsic_lround_i64_f32(float %arg) { ; Test: llvm.lround on a scalar float operand producing an i64 result.
+; GFX9-LABEL: intrinsic_lround_i64_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v2, v1, s4, |v0|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i64_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i64_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i64 @llvm.lround.i64.f32(float %arg) ; intrinsic under test; the i64 result is returned in v0 and v1 in the checks above
+  ret i64 %0
+}
+
+define i64 @intrinsic_lround_i64_f64(double %arg) {
+; GFX9-LABEL: intrinsic_lround_i64_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v4, v1
+; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
+; GFX9-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i64_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX10-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i64_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX11-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i64 @llvm.lround.i64.f64(double %arg)
+  ret i64 %0
+}
+
+define i64 @intrinsic_llround_i64_f32(float %arg) {
+; GFX9-LABEL: intrinsic_llround_i64_f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v2, v1, s4, |v0|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_llround_i64_f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_llround_i64_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i64 @llvm.llround.i64.f32(float %arg)
+  ret i64 %0
+}
+
+define i64 @intrinsic_llround_i64_f64(double %arg) {
+; GFX9-LABEL: intrinsic_llround_i64_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v4, v1
+; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
+; GFX9-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_llround_i64_f64:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX10-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_llround_i64_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX11-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i64 @llvm.llround.i64.f64(double %arg)
+  ret i64 %0
+}
+
+define half @intrinsic_fround_half(half %arg) {
+; GFX9-LABEL: intrinsic_fround_half:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_fround_half:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX10-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_fround_half:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call half @llvm.round.f16(half %arg)
+  ret half %0
+}
+
+define i32 @intrinsic_lround_i32_f16(half %arg) {
+; GFX9-LABEL: intrinsic_lround_i32_f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_i32_f16:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX10-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_i32_f16:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call i32 @llvm.lround.i32.f16(half %arg)
+  ret i32 %0
+}
+
+define <2 x float> @intrinsic_fround_v2f32_v2f32(<2 x float> %arg) {
+; GFX9-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-NEXT:    s_brev_b32 s6, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s6, v3, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
+; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfi_b32 v1, s6, v3, v1
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_fround_v2f32_v2f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
+  ret <2 x float> %0
+}
+
+define <2 x i32> @intrinsic_lround_v2i32_v2f32(<2 x float> %arg) {
+; GFX9-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-NEXT:    s_brev_b32 s6, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s6, v3, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
+; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfi_b32 v1, s6, v3, v1
+; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
+  ret <2 x i32> %0
+}
+
+define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) {
+; GFX9-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
+; GFX9-NEXT:    s_brev_b32 s6, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s6, v3, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s7, 0x2f800000
+; GFX9-NEXT:    v_mul_f32_e64 v2, |v0|, s7
+; GFX9-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-NEXT:    s_mov_b32 s8, 0xcf800000
+; GFX9-NEXT:    v_fma_f32 v3, v2, s8, |v0|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_xor_b32_e32 v0, v3, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX9-NEXT:    v_sub_f32_e32 v5, v1, v3
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX9-NEXT:    v_bfi_b32 v1, s6, v5, v1
+; GFX9-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX9-NEXT:    v_mul_f32_e64 v1, |v3|, s7
+; GFX9-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-NEXT:    v_fma_f32 v5, v1, s8, |v3|
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
+; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v4, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v3
+; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v3
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
+; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
+; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
+; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX10-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX10-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GFX10-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v1|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX10-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v0|
+; GFX10-NEXT:    v_floor_f32_e32 v3, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX10-NEXT:    v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v4
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX10-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GFX10-NEXT:    v_xor_b32_e32 v4, v0, v6
+; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
+; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v0|
+; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
+; GFX11-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; GFX11-NEXT:    v_floor_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v0|
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX11-NEXT:    v_fma_f32 v0, 0xcf800000, v3, |v1|
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v4
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GFX11-NEXT:    v_xor_b32_e32 v4, v0, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v5
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %0 = tail call <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
+  ret <2 x i64> %0
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}

>From 24c2520287010efd1bb6d79ec935f1587a7cc083 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Tue, 16 Jul 2024 14:38:19 -0500
Subject: [PATCH 2/4] Implement llvm.lround lowering for GlobalISel and
 SelectionDAG.

Also, removed handling of vector types. Vector support will be added in a later patch.
---
 llvm/docs/LangRef.rst                         |   3 +-
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |   6 -
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |   2 -
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  10 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  10 +
 .../SelectionDAG/LegalizeFloatTypes.cpp       |   4 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   2 +-
 .../SelectionDAG/LegalizeVectorOps.cpp        |   2 -
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  16 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |   2 -
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  21 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  17 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   1 -
 llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll | 263 ------------------
 llvm/test/CodeGen/AMDGPU/lround.ll            | 263 ------------------
 15 files changed, 34 insertions(+), 588 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 1fa23fd88792f..a04b5769f095f 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -16606,8 +16606,7 @@ Syntax:
 """""""
 
 This is an overloaded intrinsic. You can use ``llvm.lround`` on any
-floating-point type or vector of floating-point type. Not all targets
-support all types however.
+floating-point type. Not all targets support all types however.
 
 ::
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 4cc8a5e726fa2..5b9cc5dfeeadb 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2057,12 +2057,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     case Intrinsic::llrint:
       ISD = ISD::LLRINT;
       break;
-    case Intrinsic::lround:
-      ISD = ISD::LROUND;
-      break;
-    case Intrinsic::llround:
-      ISD = ISD::LLROUND;
-      break;
     case Intrinsic::round:
       ISD = ISD::FROUND;
       break;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index e64b5ba65cfad..8cc3b1c4a1a86 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4752,8 +4752,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case G_FRINT:
   case G_INTRINSIC_ROUND:
   case G_INTRINSIC_ROUNDEVEN:
-  case G_LROUND:
-  case G_LLROUND:
   case G_INTRINSIC_TRUNC:
   case G_FCOS:
   case G_FSIN:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 14876c60e318c..765f1e1f5f68c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -506,7 +506,7 @@ namespace {
     SDValue visitUINT_TO_FP(SDNode *N);
     SDValue visitFP_TO_SINT(SDNode *N);
     SDValue visitFP_TO_UINT(SDNode *N);
-    SDValue visitXROUND(SDNode *N);
+    SDValue visitXRINT(SDNode *N);
     SDValue visitFP_ROUND(SDNode *N);
     SDValue visitFP_EXTEND(SDNode *N);
     SDValue visitFNEG(SDNode *N);
@@ -1925,9 +1925,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::FP_TO_SINT:         return visitFP_TO_SINT(N);
   case ISD::FP_TO_UINT:         return visitFP_TO_UINT(N);
   case ISD::LRINT:
-  case ISD::LLRINT:
-  case ISD::LROUND:
-  case ISD::LLROUND:            return visitXROUND(N);
+  case ISD::LLRINT:             return visitXRINT(N);
   case ISD::FP_ROUND:           return visitFP_ROUND(N);
   case ISD::FP_EXTEND:          return visitFP_EXTEND(N);
   case ISD::FNEG:               return visitFNEG(N);
@@ -17808,17 +17806,15 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
   return FoldIntToFPToInt(N, DAG);
 }
 
-SDValue DAGCombiner::visitXROUND(SDNode *N) {
+SDValue DAGCombiner::visitXRINT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
   // fold (lrint|llrint undef) -> undef
-  // fold (lround|llround undef) -> undef
   if (N0.isUndef())
     return DAG.getUNDEF(VT);
 
   // fold (lrint|llrint c1fp) -> c1
-  // fold (lround|llround c1fp) -> c1
   if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
     return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0);
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 9f515739ee048..cef5d765c11de 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3439,6 +3439,16 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   case ISD::FP_TO_UINT_SAT:
     Results.push_back(TLI.expandFP_TO_INT_SAT(Node, DAG));
     break;
+  case ISD::LROUND:
+  case ISD::LLROUND: {
+    SDValue Arg = Node->getOperand(0);
+    EVT ArgVT = Arg.getValueType();
+    EVT ResVT = Node->getValueType(0);
+    SDLoc dl(Node);
+    SDValue RoundNode = DAG.getNode(ISD::FROUND, dl, ArgVT, Arg);
+    Results.push_back(DAG.getNode(ISD::FP_TO_SINT, dl, ResVT, RoundNode));
+    break;
+  }
   case ISD::VAARG:
     Results.push_back(DAG.expandVAArg(Node));
     Results.push_back(Results[0].getValue(1));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 7da3bfa9b0f38..41fcc9afe4e90 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2410,9 +2410,7 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
     case ISD::FP_TO_SINT:
     case ISD::FP_TO_UINT:
     case ISD::LRINT:
-    case ISD::LLRINT:
-    case ISD::LROUND:
-    case ISD::LLROUND:     R = PromoteFloatOp_UnaryOp(N, OpNo); break;
+    case ISD::LLRINT:     R = PromoteFloatOp_UnaryOp(N, OpNo); break;
     case ISD::FP_TO_SINT_SAT:
     case ISD::FP_TO_UINT_SAT:
                           R = PromoteFloatOp_FP_TO_XINT_SAT(N, OpNo); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 7dfc0916dd79f..7af47ed250d91 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1041,7 +1041,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue WidenVecRes_Convert(SDNode *N);
   SDValue WidenVecRes_Convert_StrictFP(SDNode *N);
   SDValue WidenVecRes_FP_TO_XINT_SAT(SDNode *N);
-  SDValue WidenVecRes_XROUND(SDNode *N);
+  SDValue WidenVecRes_XRINT(SDNode *N);
   SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
   SDValue WidenVecRes_UnarySameEltsWithScalarArg(SDNode *N);
   SDValue WidenVecRes_ExpOp(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 7960020c33660..307d1fc920d48 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -472,8 +472,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
                                               Node->getValueType(0), Scale);
     break;
   }
-  case ISD::LROUND:
-  case ISD::LLROUND:
   case ISD::LRINT:
   case ISD::LLRINT:
   case ISD::SINT_TO_FP:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index d75d0b1a497f1..1a575abbc16f4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -108,8 +108,6 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FRINT:
   case ISD::LRINT:
   case ISD::LLRINT:
-  case ISD::LROUND:
-  case ISD::LLROUND:
   case ISD::FROUND:
   case ISD::FROUNDEVEN:
   case ISD::FSIN:
@@ -754,8 +752,6 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::UINT_TO_FP:
   case ISD::LRINT:
   case ISD::LLRINT:
-  case ISD::LROUND:
-  case ISD::LLROUND:
     Res = ScalarizeVecOp_UnaryOp(N);
     break;
   case ISD::STRICT_SINT_TO_FP:
@@ -1193,8 +1189,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::VP_LRINT:
   case ISD::LLRINT:
   case ISD::VP_LLRINT:
-  case ISD::LROUND:
-  case ISD::LLROUND:
   case ISD::FROUND:
   case ISD::VP_FROUND:
   case ISD::FROUNDEVEN:
@@ -3178,8 +3172,6 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:
   case ISD::FTRUNC:
-  case ISD::LROUND:
-  case ISD::LLROUND:
   case ISD::LRINT:
   case ISD::LLRINT:
     Res = SplitVecOp_UnaryOp(N);
@@ -4494,13 +4486,11 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
     Res = WidenVecRes_FP_TO_XINT_SAT(N);
     break;
 
-  case ISD::LROUND:
-  case ISD::LLROUND:
   case ISD::LRINT:
   case ISD::LLRINT:
   case ISD::VP_LRINT:
   case ISD::VP_LLRINT:
-    Res = WidenVecRes_XROUND(N);
+    Res = WidenVecRes_XRINT(N);
     break;
 
   case ISD::FABS:
@@ -5117,7 +5107,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_FP_TO_XINT_SAT(SDNode *N) {
   return DAG.getNode(N->getOpcode(), dl, WidenVT, Src, N->getOperand(1));
 }
 
-SDValue DAGTypeLegalizer::WidenVecRes_XROUND(SDNode *N) {
+SDValue DAGTypeLegalizer::WidenVecRes_XRINT(SDNode *N) {
   SDLoc dl(N);
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   ElementCount WidenNumElts = WidenVT.getVectorElementCount();
@@ -6346,8 +6336,6 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
   case ISD::VSELECT:            Res = WidenVecOp_VSELECT(N); break;
   case ISD::FLDEXP:
   case ISD::FCOPYSIGN:
-  case ISD::LROUND:
-  case ISD::LLROUND:
   case ISD::LRINT:
   case ISD::LLRINT:
     Res = WidenVecOp_UnrollVectorOp(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 01dcd6612ca30..897bdc71818f8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5420,8 +5420,6 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
   case ISD::FROUND:
   case ISD::FROUNDEVEN:
   case ISD::FRINT:
-  case ISD::LROUND:
-  case ISD::LLROUND:
   case ISD::LRINT:
   case ISD::LLRINT:
   case ISD::FNEARBYINT:
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 1abfe14e6c539..bf569ac433d71 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1115,8 +1115,7 @@ void TargetLoweringBase::initActions() {
           {ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG,
            ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
            ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::FTAN, ISD::FACOS,
-           ISD::FASIN, ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH,
-           ISD::LROUND, ISD::LLROUND},
+           ISD::FASIN, ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH},
           VT, Expand);
 
       // Constrained floating-point operations default to expand.
@@ -1168,13 +1167,17 @@ void TargetLoweringBase::initActions() {
                      Expand);
 
   // These library functions default to expand.
-  setOperationAction({ISD::FCBRT,      ISD::FLOG,    ISD::FLOG2,  ISD::FLOG10,
-                      ISD::FEXP,       ISD::FEXP2,   ISD::FEXP10, ISD::FFLOOR,
-                      ISD::FNEARBYINT, ISD::FCEIL,   ISD::FRINT,  ISD::FTRUNC,
-                      ISD::LROUND,     ISD::LLROUND, ISD::LRINT,  ISD::LLRINT,
-                      ISD::FROUNDEVEN, ISD::FTAN,    ISD::FACOS,  ISD::FASIN,
-                      ISD::FATAN,      ISD::FCOSH,   ISD::FSINH,  ISD::FTANH},
-                     {MVT::f32, MVT::f64, MVT::f128}, Expand);
+  setOperationAction(
+      {ISD::FCBRT, ISD::FLOG,   ISD::FLOG2,  ISD::FLOG10,     ISD::FEXP,
+       ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
+       ISD::FRINT, ISD::FTRUNC, ISD::LRINT,  ISD::LLRINT,     ISD::FROUNDEVEN,
+       ISD::FTAN,  ISD::FACOS,  ISD::FASIN,  ISD::FATAN,      ISD::FCOSH,
+       ISD::FSINH, ISD::FTANH},
+      {MVT::f32, MVT::f64, MVT::f128}, Expand);
+
+  // Unless the target expands, default LROUND to LibCall.
+  setOperationAction({ISD::LROUND, ISD::LLROUND},
+                     {MVT::f32, MVT::f64, MVT::f128}, LibCall);
 
   setOperationAction({ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, ISD::FCOSH,
                       ISD::FSINH, ISD::FTANH},
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d5b9e03d69aba..405621cb729d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -396,8 +396,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                      MVT::f32, Legal);
 
   setOperationAction(ISD::FLOG2, MVT::f32, Custom);
-  setOperationAction({ISD::FROUND, ISD::LROUND, ISD::LLROUND},
-                     {MVT::f16, MVT::f32, MVT::f64}, Custom);
+  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
+  setOperationAction({ISD::LROUND, ISD::LLROUND},
+                     {MVT::f16, MVT::f32, MVT::f64}, Expand);
 
   setOperationAction(
       {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
@@ -1387,9 +1388,6 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   case ISD::FROUNDEVEN:
     return LowerFROUNDEVEN(Op, DAG);
   case ISD::FROUND: return LowerFROUND(Op, DAG);
-  case ISD::LROUND:
-  case ISD::LLROUND:
-    return LowerLROUND(Op, DAG);
   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
   case ISD::FLOG2:
     return LowerFLOG2(Op, DAG);
@@ -2502,7 +2500,7 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue X = Op.getOperand(0);
-  EVT VT = X.getValueType();
+  EVT VT = Op.getValueType();
 
   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
 
@@ -2526,13 +2524,6 @@ SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
 }
 
-SDValue AMDGPUTargetLowering::LowerLROUND(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc SL(Op);
-  EVT ResVT = Op.getValueType();
-  SDValue FRoundNode = LowerFROUND(Op, DAG);
-  return DAG.getNode(ISD::FP_TO_SINT, SL, ResVT, FRoundNode);
-}
-
 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index eb7950c1c75fc..37572af3897f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -59,7 +59,6 @@ class AMDGPUTargetLowering : public TargetLowering {
 
   SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerLROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
 
   static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll
index fd4ebfdf0cc28..7fb28524d1d2d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll
@@ -11,9 +11,6 @@ declare i64 @llvm.lround.i64.f64(double)
 declare i64 @llvm.llround.i64.f32(float)
 declare half @llvm.round.f16(half)
 declare i32 @llvm.lround.i32.f16(half %arg)
-declare <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
-declare <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
-declare <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
 
 define float @intrinsic_fround(float %arg) {
 ; GFX9-LABEL: intrinsic_fround:
@@ -550,265 +547,5 @@ entry:
   ret i32 %0
 }
 
-define <2 x float> @intrinsic_fround_v2f32_v2f32(<2 x float> %arg) {
-; GFX9-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v3
-; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
-; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v3
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
-; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
-; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
-; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
-; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
-  ret <2 x float> %0
-}
-
-define <2 x i32> @intrinsic_lround_v2i32_v2f32(<2 x float> %arg) {
-; GFX9-LABEL: intrinsic_lround_v2i32_v2f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v3
-; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
-; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v3
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_v2i32_v2f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
-; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
-; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_v2i32_v2f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
-; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
-  ret <2 x i32> %0
-}
-
-define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) {
-; GFX9-LABEL: intrinsic_lround_v2i64_v2f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_bfrev_b32_e32 v4, 1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v3
-; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x2f800000
-; GFX9-NEXT:    v_mul_f32_e64 v5, |v2|, v3
-; GFX9-NEXT:    v_floor_f32_e32 v5, v5
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0xcf800000
-; GFX9-NEXT:    v_fma_f32 v2, v5, v6, |v2|
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v7
-; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v7
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v1
-; GFX9-NEXT:    v_sub_f32_e32 v8, v1, v5
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v8
-; GFX9-NEXT:    v_add_f32_e32 v4, v5, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v4
-; GFX9-NEXT:    v_mul_f32_e64 v3, |v1|, v3
-; GFX9-NEXT:    v_floor_f32_e32 v3, v3
-; GFX9-NEXT:    v_fma_f32 v1, v3, v6, |v1|
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v7
-; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v7, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v4
-; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v4
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_v2i64_v2f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
-; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
-; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
-; GFX10-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v2|
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX10-NEXT:    v_floor_f32_e32 v4, v4
-; GFX10-NEXT:    v_mul_f32_e64 v5, 0x2f800000, |v3|
-; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v4, |v2|
-; GFX10-NEXT:    v_floor_f32_e32 v5, v5
-; GFX10-NEXT:    v_fma_f32 v0, 0xcf800000, v5, |v3|
-; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v2
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v4
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v5
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX10-NEXT:    v_xor_b32_e32 v4, v4, v3
-; GFX10-NEXT:    v_xor_b32_e32 v5, v0, v3
-; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v6
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, v3
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_v2i64_v2f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v5
-; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e64 v4, 0x2f800000, |v2|
-; GFX11-NEXT:    v_mul_f32_e64 v5, 0x2f800000, |v3|
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_floor_f32_e32 v4, v4
-; GFX11-NEXT:    v_floor_f32_e32 v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v4, |v2|
-; GFX11-NEXT:    v_fma_f32 v0, 0xcf800000, v5, |v3|
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v2
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v4
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v4, v5
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-NEXT:    v_xor_b32_e32 v4, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_xor_b32_e32 v5, v0, v3
-; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v6, vcc_lo
-; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v5, v3
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v4, v3, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
-  ret <2 x i64> %0
-}
-
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index a20dbd91fb29e..dfda7e8e81f9a 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -11,9 +11,6 @@ declare i64 @llvm.lround.i64.f64(double)
 declare i64 @llvm.llround.i64.f32(float)
 declare half @llvm.round.f16(half)
 declare i32 @llvm.lround.i32.f16(half %arg)
-declare <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
-declare <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
-declare <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
 
 define float @intrinsic_fround(float %arg) {
 ; GFX9-LABEL: intrinsic_fround:
@@ -543,265 +540,5 @@ entry:
   %0 = tail call i32 @llvm.lround.i32.f16(half %arg)
   ret i32 %0
 }
-
-define <2 x float> @intrinsic_fround_v2f32_v2f32(<2 x float> %arg) {
-; GFX9-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-NEXT:    s_brev_b32 s6, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s6, v3, v0
-; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
-; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_bfi_b32 v1, s6, v3, v1
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
-; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
-; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_fround_v2f32_v2f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
-; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call <2 x float> @llvm.round.v2f32.v2f32(<2 x float> %arg)
-  ret <2 x float> %0
-}
-
-define <2 x i32> @intrinsic_lround_v2i32_v2f32(<2 x float> %arg) {
-; GFX9-LABEL: intrinsic_lround_v2i32_v2f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-NEXT:    s_brev_b32 s6, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s6, v3, v0
-; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v1
-; GFX9-NEXT:    v_sub_f32_e32 v3, v1, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_bfi_b32 v1, s6, v3, v1
-; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_v2i32_v2f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
-; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_v2i32_v2f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
-; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call <2 x i32> @llvm.lround.v2i32.v2f32(<2 x float> %arg)
-  ret <2 x i32> %0
-}
-
-define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) {
-; GFX9-LABEL: intrinsic_lround_v2i64_v2f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX9-NEXT:    v_sub_f32_e32 v3, v0, v2
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[4:5]
-; GFX9-NEXT:    s_brev_b32 s6, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s6, v3, v0
-; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s7, 0x2f800000
-; GFX9-NEXT:    v_mul_f32_e64 v2, |v0|, s7
-; GFX9-NEXT:    v_floor_f32_e32 v2, v2
-; GFX9-NEXT:    s_mov_b32 s8, 0xcf800000
-; GFX9-NEXT:    v_fma_f32 v3, v2, s8, |v0|
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_xor_b32_e32 v0, v3, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX9-NEXT:    v_sub_f32_e32 v5, v1, v3
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_bfi_b32 v1, s6, v5, v1
-; GFX9-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX9-NEXT:    v_mul_f32_e64 v1, |v3|, s7
-; GFX9-NEXT:    v_floor_f32_e32 v1, v1
-; GFX9-NEXT:    v_fma_f32 v5, v1, s8, |v3|
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, v5, v3
-; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v3
-; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_v2i64_v2f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX10-NEXT:    v_sub_f32_e32 v4, v0, v2
-; GFX10-NEXT:    v_sub_f32_e32 v5, v1, v3
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v4|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s4
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v5|, 0.5
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s4
-; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
-; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX10-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX10-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v0|
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
-; GFX10-NEXT:    v_floor_f32_e32 v2, v2
-; GFX10-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v1|
-; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GFX10-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v0|
-; GFX10-NEXT:    v_floor_f32_e32 v3, v3
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX10-NEXT:    v_fma_f32 v0, 0xcf800000, v3, |v1|
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v4
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v5
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX10-NEXT:    v_xor_b32_e32 v3, v3, v6
-; GFX10-NEXT:    v_xor_b32_e32 v4, v0, v6
-; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v5
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_v2i64_v2f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v2, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_sub_f32 v4, v0, v2 :: v_dual_sub_f32 v5, v1, v3
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1.0, s0
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v5|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v4, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v5, v1
-; GFX11-NEXT:    v_dual_add_f32 v0, v2, v0 :: v_dual_add_f32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v0|
-; GFX11-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
-; GFX11-NEXT:    v_mul_f32_e64 v3, 0x2f800000, |v1|
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_floor_f32_e32 v2, v2
-; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GFX11-NEXT:    v_floor_f32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_fma_f32 v4, 0xcf800000, v2, |v0|
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX11-NEXT:    v_fma_f32 v0, 0xcf800000, v3, |v1|
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v4
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_xor_b32_e32 v3, v3, v6
-; GFX11-NEXT:    v_xor_b32_e32 v4, v0, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v1, v5
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v2, v5, vcc_lo
-; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v4, v6
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call <2 x i64> @llvm.lround.v2i64.v2f32(<2 x float> %arg)
-  ret <2 x i64> %0
-}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}

>From 4d5ccd9f9f96d02973bc005497f63c36012162f4 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Tue, 16 Jul 2024 17:32:13 -0500
Subject: [PATCH 3/4] Update X64 rule for llvm.lround

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1d947ac2346d0..5d3a998c3e83e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -849,8 +849,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
     setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
     setOperationAction(ISD::FMA, MVT::f80, Expand);
-    setOperationAction(ISD::LROUND, MVT::f80, Expand);
-    setOperationAction(ISD::LLROUND, MVT::f80, Expand);
+    setOperationAction(ISD::LROUND, MVT::f80, LibCall);
+    setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
     setOperationAction(ISD::LRINT, MVT::f80, Custom);
     setOperationAction(ISD::LLRINT, MVT::f80, Custom);
 

>From 47b184d5b07c28e84a55e223c63aebf629baf151 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Tue, 23 Jul 2024 10:30:37 -0500
Subject: [PATCH 4/4] Address comments

---
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |    1 -
 llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll |  551 ------
 llvm/test/CodeGen/AMDGPU/lround.ll            | 1490 +++++++++++------
 3 files changed, 991 insertions(+), 1051 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll

diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index bf569ac433d71..c6c69fec64bcd 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1175,7 +1175,6 @@ void TargetLoweringBase::initActions() {
        ISD::FSINH, ISD::FTANH},
       {MVT::f32, MVT::f64, MVT::f128}, Expand);
 
-  // Unless the target expands, default LROUND to LibCall.
   setOperationAction({ISD::LROUND, ISD::LLROUND},
                      {MVT::f32, MVT::f64, MVT::f128}, LibCall);
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll
deleted file mode 100644
index 7fb28524d1d2d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lround.ll
+++ /dev/null
@@ -1,551 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-
-declare float @llvm.round.f32(float)
-declare i32 @llvm.lround.i32.f32(float)
-declare i32 @llvm.lround.i32.f64(double)
-declare i64 @llvm.lround.i64.f32(float)
-declare i64 @llvm.lround.i64.f64(double)
-declare i64 @llvm.llround.i64.f32(float)
-declare half @llvm.round.f16(half)
-declare i32 @llvm.lround.i32.f16(half %arg)
-
-define float @intrinsic_fround(float %arg) {
-; GFX9-LABEL: intrinsic_fround:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_fround:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_fround:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call float @llvm.round.f32(float %arg)
-  ret float %0
-}
-
-define i32 @intrinsic_lround_i32_f32(float %arg) {
-; GFX9-LABEL: intrinsic_lround_i32_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_i32_f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_i32_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call i32 @llvm.lround.i32.f32(float %arg)
-  ret i32 %0
-}
-
-define i32 @intrinsic_lround_i32_f64(double %arg) {
-; GFX9-LABEL: intrinsic_lround_i32_f64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
-; GFX9-NEXT:    s_brev_b32 s4, 1
-; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX9-NEXT:    v_and_or_b32 v0, v0, 0, 0
-; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
-; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_i32_f64:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0, 0
-; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
-; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
-; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX10-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_i32_f64:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX11-NEXT:    v_and_or_b32 v0, v0, 0, 0
-; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
-; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX11-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call i32 @llvm.lround.i32.f64(double %arg)
-  ret i32 %0
-}
-
-define i64 @intrinsic_lround_i64_f32(float %arg) {
-; GFX9-LABEL: intrinsic_lround_i64_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x2f800000
-; GFX9-NEXT:    v_mul_f32_e64 v2, |v1|, v2
-; GFX9-NEXT:    v_floor_f32_e32 v2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xcf800000
-; GFX9-NEXT:    v_fma_f32 v1, v2, v3, |v1|
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v3
-; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_i64_f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX10-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
-; GFX10-NEXT:    v_floor_f32_e32 v2, v2
-; GFX10-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v2
-; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_i64_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
-; GFX11-NEXT:    v_floor_f32_e32 v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v1
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call i64 @llvm.lround.i64.f32(float %arg)
-  ret i64 %0
-}
-
-define i64 @intrinsic_lround_i64_f64(double %arg) {
-; GFX9-LABEL: intrinsic_lround_i64_f64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
-; GFX9-NEXT:    s_brev_b32 s4, 1
-; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX9-NEXT:    v_and_or_b32 v0, v0, 0, 0
-; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
-; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3df00000
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
-; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX9-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX9-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_i64_f64:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0, 0
-; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
-; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
-; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX10-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX10-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
-; GFX10-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX10-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
-; GFX10-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX10-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_i64_f64:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX11-NEXT:    v_and_or_b32 v0, v0, 0, 0
-; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
-; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX11-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
-; GFX11-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
-; GFX11-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX11-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call i64 @llvm.lround.i64.f64(double %arg)
-  ret i64 %0
-}
-
-define i64 @intrinsic_llround_i64_f32(float %arg) {
-; GFX9-LABEL: intrinsic_llround_i64_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x2f800000
-; GFX9-NEXT:    v_mul_f32_e64 v2, |v1|, v2
-; GFX9-NEXT:    v_floor_f32_e32 v2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xcf800000
-; GFX9-NEXT:    v_fma_f32 v1, v2, v3, |v1|
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v3
-; GFX9-NEXT:    v_xor_b32_e32 v1, v2, v3
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_llround_i64_f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX10-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
-; GFX10-NEXT:    v_floor_f32_e32 v2, v2
-; GFX10-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v2
-; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_llround_i64_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
-; GFX11-NEXT:    v_floor_f32_e32 v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v1
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call i64 @llvm.llround.i64.f32(float %arg)
-  ret i64 %0
-}
-
-define i64 @intrinsic_llround_i64_f64(double %arg) {
-; GFX9-LABEL: intrinsic_llround_i64_f64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
-; GFX9-NEXT:    s_brev_b32 s4, 1
-; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX9-NEXT:    v_and_or_b32 v0, v0, 0, 0
-; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v4
-; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3df00000
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
-; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX9-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
-; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
-; GFX9-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX9-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_llround_i64_f64:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0, 0
-; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
-; GFX10-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
-; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX10-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX10-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
-; GFX10-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX10-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
-; GFX10-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX10-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_llround_i64_f64:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX11-NEXT:    v_and_or_b32 v0, v0, 0, 0
-; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
-; GFX11-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX11-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
-; GFX11-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
-; GFX11-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX11-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call i64 @llvm.llround.i64.f64(double %arg)
-  ret i64 %0
-}
-
-define half @intrinsic_fround_half(half %arg) {
-; GFX9-LABEL: intrinsic_fround_half:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_fround_half:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX10-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_fround_half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
-; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call half @llvm.round.f16(half %arg)
-  ret half %0
-}
-
-define i32 @intrinsic_lround_i32_f16(half %arg) {
-; GFX9-LABEL: intrinsic_lround_i32_f16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_i32_f16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX10-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
-; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX10-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_i32_f16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
-; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %0 = tail call i32 @llvm.lround.i32.f16(half %arg)
-  ret i32 %0
-}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index dfda7e8e81f9a..d45d83026013d 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -1,7 +1,10 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-GISEL %s
 
 declare float @llvm.round.f32(float)
 declare i32 @llvm.lround.i32.f32(float)
@@ -13,532 +16,1021 @@ declare half @llvm.round.f16(half)
 declare i32 @llvm.lround.i32.f16(half %arg)
 
 define float @intrinsic_fround(float %arg) {
-; GFX9-LABEL: intrinsic_fround:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_fround:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_fround:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: intrinsic_fround:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_fround:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_fround:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_fround:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_fround:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_fround:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = tail call float @llvm.round.f32(float %arg)
-  ret float %0
+  %res = tail call float @llvm.round.f32(float %arg)
+  ret float %res
 }
 
 define i32 @intrinsic_lround_i32_f32(float %arg) {
-; GFX9-LABEL: intrinsic_lround_i32_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_i32_f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_i32_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: intrinsic_lround_i32_f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_i32_f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_i32_f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_i32_f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_i32_f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_i32_f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = tail call i32 @llvm.lround.i32.f32(float %arg)
-  ret i32 %0
+  %res = tail call i32 @llvm.lround.i32.f32(float %arg)
+  ret i32 %res
 }
 
 define i32 @intrinsic_lround_i32_f64(double %arg) {
-; GFX9-LABEL: intrinsic_lround_i32_f64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
-; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_i32_f64:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 0x3ff00000, s4
-; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX10-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_i32_f64:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 0x3ff00000, s0
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v0, v1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX11-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: intrinsic_lround_i32_f64:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
+; GFX9-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-SDAG-NEXT:    v_bfi_b32 v1, s4, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_i32_f64:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-GISEL-NEXT:    s_brev_b32 s4, 1
+; GFX9-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_i32_f64:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-SDAG-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 0x3ff00000, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_i32_f64:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-GISEL-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_i32_f64:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-SDAG-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 0x3ff00000, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v0, v1
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_i32_f64:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-GISEL-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = tail call i32 @llvm.lround.i32.f64(double %arg)
-  ret i32 %0
+  %res = tail call i32 @llvm.lround.i32.f64(double %arg)
+  ret i32 %res
 }
 
 define i64 @intrinsic_lround_i64_f32(float %arg) {
-; GFX9-LABEL: intrinsic_lround_i64_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
-; GFX9-NEXT:    v_mul_f32_e64 v1, |v0|, s4
-; GFX9-NEXT:    v_floor_f32_e32 v1, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0xcf800000
-; GFX9-NEXT:    v_fma_f32 v2, v1, s4, |v0|
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v3
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_i64_f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
-; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX10-NEXT:    v_floor_f32_e32 v1, v1
-; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v2
-; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_i64_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX11-NEXT:    v_floor_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: intrinsic_lround_i64_f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-SDAG-NEXT:    v_fma_f32 v2, v1, s4, |v0|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_i64_f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_i64_f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_i64_f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_i64_f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_i64_f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = tail call i64 @llvm.lround.i64.f32(float %arg)
-  ret i64 %0
+  %res = tail call i64 @llvm.lround.i64.f32(float %arg)
+  ret i64 %res
 }
 
 define i64 @intrinsic_lround_i64_f64(double %arg) {
-; GFX9-LABEL: intrinsic_lround_i64_f64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v4, v1
-; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
-; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
-; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
-; GFX9-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX9-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_i64_f64:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
-; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
-; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX10-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
-; GFX10-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX10-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
-; GFX10-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX10-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_i64_f64:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX11-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
-; GFX11-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
-; GFX11-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX11-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: intrinsic_lround_i64_f64:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-SDAG-NEXT:    v_bfi_b32 v1, s4, v4, v1
+; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_i64_f64:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-GISEL-NEXT:    s_brev_b32 s4, 1
+; GFX9-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_i64_f64:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX10-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_i64_f64:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-GISEL-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_i64_f64:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX11-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_i64_f64:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-GISEL-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = tail call i64 @llvm.lround.i64.f64(double %arg)
-  ret i64 %0
+  %res = tail call i64 @llvm.lround.i64.f64(double %arg)
+  ret i64 %res
 }
 
 define i64 @intrinsic_llround_i64_f32(float %arg) {
-; GFX9-LABEL: intrinsic_llround_i64_f32:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX9-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
-; GFX9-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x2f800000
-; GFX9-NEXT:    v_mul_f32_e64 v1, |v0|, s4
-; GFX9-NEXT:    v_floor_f32_e32 v1, v1
-; GFX9-NEXT:    s_mov_b32 s4, 0xcf800000
-; GFX9-NEXT:    v_fma_f32 v2, v1, s4, |v0|
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v3
-; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_llround_i64_f32:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX10-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
-; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX10-NEXT:    v_floor_f32_e32 v1, v1
-; GFX10-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v2
-; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_llround_i64_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_trunc_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
-; GFX11-NEXT:    v_floor_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: intrinsic_llround_i64_f32:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x2f800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
+; GFX9-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX9-SDAG-NEXT:    v_fma_f32 v2, v1, s4, |v0|
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX9-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_llround_i64_f32:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
+; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX9-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX9-GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX9-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_llround_i64_f32:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX10-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX10-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_llround_i64_f32:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_cmp_ge_f32_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX10-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX10-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX10-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX10-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX10-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX10-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX10-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_llround_i64_f32:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fffffff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_mul_f32_e64 v1, 0x2f800000, |v0|
+; GFX11-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_fma_f32 v2, 0xcf800000, v1, |v0|
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-SDAG-NEXT:    v_cvt_u32_f32_e32 v0, v2
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-SDAG-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_llround_i64_f32:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, 0x80000000, v0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX11-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f32_e64 v2, 0x2f800000, |v1|
+; GFX11-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_fma_f32 v1, 0xcf800000, v2, |v1|
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v1
+; GFX11-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v3
+; GFX11-GISEL-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = tail call i64 @llvm.llround.i64.f32(float %arg)
-  ret i64 %0
+  %res = tail call i64 @llvm.llround.i64.f32(float %arg)
+  ret i64 %res
 }
 
 define i64 @intrinsic_llround_i64_f64(double %arg) {
-; GFX9-LABEL: intrinsic_llround_i64_f64:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
-; GFX9-NEXT:    s_brev_b32 s4, -2
-; GFX9-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v4, v1
-; GFX9-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT:    s_movk_i32 s4, 0xffe0
-; GFX9-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
-; GFX9-NEXT:    s_mov_b32 s4, 0
-; GFX9-NEXT:    s_mov_b32 s5, 0xc1f00000
-; GFX9-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX9-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
-; GFX9-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX9-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_llround_i64_f64:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX10-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
-; GFX10-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
-; GFX10-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX10-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX10-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
-; GFX10-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX10-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
-; GFX10-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX10-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_llround_i64_f64:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
-; GFX11-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
-; GFX11-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
-; GFX11-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
-; GFX11-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: intrinsic_llround_i64_f64:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-SDAG-NEXT:    s_brev_b32 s4, -2
+; GFX9-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-SDAG-NEXT:    v_bfi_b32 v1, s4, v4, v1
+; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffe0
+; GFX9-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
+; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc1f00000
+; GFX9-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[2:3], s[4:5], v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_llround_i64_f64:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX9-GISEL-NEXT:    s_brev_b32 s4, 1
+; GFX9-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX9-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX9-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX9-GISEL-NEXT:    v_and_or_b32 v1, v1, s4, v4
+; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3df00000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc1f00000
+; GFX9-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[2:3], v[4:5], v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_llround_i64_f64:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX10-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_llround_i64_f64:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX10-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX10-GISEL-NEXT:    v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4
+; GFX10-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX10-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX10-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_llround_i64_f64:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v1, 0x7fffffff, v4, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], 0xffffffe0
+; GFX11-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_llround_i64_f64:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX11-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX11-GISEL-NEXT:    v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0
+; GFX11-GISEL-NEXT:    v_and_or_b32 v1, 0x80000000, v1, v4
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0x3df00000, v[0:1]
+; GFX11-GISEL-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], 0xc1f00000, v[2:3], v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX11-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = tail call i64 @llvm.llround.i64.f64(double %arg)
-  ret i64 %0
+  %res = tail call i64 @llvm.llround.i64.f64(double %arg)
+  ret i64 %res
 }
 
 define half @intrinsic_fround_half(half %arg) {
-; GFX9-LABEL: intrinsic_fround_half:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
-; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_fround_half:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX10-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
-; GFX10-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_fround_half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: intrinsic_fround_half:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-SDAG-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_fround_half:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-GISEL-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_fround_half:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_fround_half:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX10-GISEL-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_fround_half:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_fround_half:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX11-GISEL-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = tail call half @llvm.round.f16(half %arg)
-  ret half %0
+  %res = tail call half @llvm.round.f16(half %arg)
+  ret half %res
 }
 
 define i32 @intrinsic_lround_i32_f16(half %arg) {
-; GFX9-LABEL: intrinsic_lround_i32_f16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v2, v0
-; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: intrinsic_lround_i32_f16:
-; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX10-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
-; GFX10-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: intrinsic_lround_i32_f16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: intrinsic_lround_i32_f16:
+; GFX9-SDAG:       ; %bb.0: ; %entry
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-SDAG-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-SDAG-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX9-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: intrinsic_lround_i32_f16:
+; GFX9-GISEL:       ; %bb.0: ; %entry
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX9-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-GISEL-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX9-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX9-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: intrinsic_lround_i32_f16:
+; GFX10-SDAG:       ; %bb.0: ; %entry
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-SDAG-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX10-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: intrinsic_lround_i32_f16:
+; GFX10-GISEL:       ; %bb.0: ; %entry
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX10-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX10-GISEL-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: intrinsic_lround_i32_f16:
+; GFX11-SDAG:       ; %bb.0: ; %entry
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: intrinsic_lround_i32_f16:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX11-GISEL-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
-  %0 = tail call i32 @llvm.lround.i32.f16(half %arg)
-  ret i32 %0
+  %res = tail call i32 @llvm.lround.i32.f16(half %arg)
+  ret i32 %res
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
+



More information about the llvm-commits mailing list