[llvm] 0da582d - GlobalISel: Handle llvm.roundeven

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 29 17:01:21 PDT 2020


Author: Matt Arsenault
Date: 2020-07-29T20:01:12-04:00
New Revision: 0da582d9b6cc8a96682628ff90b5e0e5d155358c

URL: https://github.com/llvm/llvm-project/commit/0da582d9b6cc8a96682628ff90b5e0e5d155358c
DIFF: https://github.com/llvm/llvm-project/commit/0da582d9b6cc8a96682628ff90b5e0e5d155358c.diff

LOG: GlobalISel: Handle llvm.roundeven

I still think it's highly questionable that we have two intrinsics with
identical behavior that differ only in the name of the libcall used if
they happen to be lowered that way, but this tries to reduce the feature
delta between SDAG and GlobalISel for recently added intrinsics. I'm not
sure which opcode should be considered the canonical one, but lower
roundeven to rint, since under the default rounding mode they are the
same operation.
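
For illustration only (not part of the commit): the equivalence this
lowering relies on is that, under the default round-to-nearest-even mode,
rint() already rounds ties to the even neighbor, which is exactly
llvm.roundeven's semantics. A minimal standalone C++ check:

  #include <cfenv>
  #include <cmath>
  #include <cstdio>

  int main() {
    std::fesetround(FE_TONEAREST); // the default FP environment
    // Ties go to the even neighbor: 0.5 -> 0, 1.5 -> 2, 2.5 -> 2,
    // matching llvm.roundeven on the same inputs.
    std::printf("%g %g %g\n", std::rint(0.5), std::rint(1.5), std::rint(2.5));
    return 0;
  }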

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
    llvm/test/CodeGen/X86/GlobalISel/roundeven.ll

Modified: 
    llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
    llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
    llvm/include/llvm/Support/TargetOpcodes.def
    llvm/include/llvm/Target/GenericOpcodes.td
    llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
    llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/X86/X86LegalizerInfo.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index d5f5ebe2a081..d925c53a5750 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -228,6 +228,8 @@ class LegalizerHelper {
                          ArrayRef<Register> Src1Regs,
                          ArrayRef<Register> Src2Regs, LLT NarrowTy);
 
+  void changeOpcode(MachineInstr &MI, unsigned NewOpcode);
+
 public:
   /// Return the alignment to use for a stack temporary object with the given
   /// type.

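The intended use of the new helper (shown in the LegalizerHelper.cpp change
below) is a one-line, observer-safe opcode rewrite during lowering, for
example:

  // Sketch: mutate the instruction in place; the helper wraps the
  // Observer.changingInstr()/changedInstr() notifications around setDesc().
  changeOpcode(MI, TargetOpcode::G_FRINT);
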
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 7123aff36cd1..57cc693e1817 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -667,6 +667,15 @@ class LegalizeRuleSet {
                                      Types2);
   }
 
+  /// The instruction is emitted as a library call.
+  LegalizeRuleSet &libcall() {
+    using namespace LegalizeMutations;
+    // We have no choice but to conservatively assume that a predicate-less
+    // libcall action properly handles all type indices by design:
+    markAllIdxsAsCovered();
+    return actionIf(LegalizeAction::Libcall, always);
+  }
+
   /// Like legalIf, but for the Libcall action.
   LegalizeRuleSet &libcallIf(LegalityPredicate Predicate) {
     // We have no choice but conservatively assume that a libcall with a

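A target opts into the new rule with its action-definitions builder; the
X86LegalizerInfo change later in this patch uses exactly this pattern. A
minimal sketch:

  // Scalarize vectors, widen small scalars to at least s32, and emit the
  // remaining cases as libcalls.
  getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN)
      .scalarize(0)
      .minScalar(0, LLT::scalar(32))
      .libcall();
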
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index 9470b1d52bba..572da717bc0b 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -297,6 +297,9 @@ HANDLE_TARGET_OPCODE(G_INTRINSIC_ROUND)
 /// INTRINSIC round to integer intrinsic.
 HANDLE_TARGET_OPCODE(G_INTRINSIC_LRINT)
 
+/// INTRINSIC roundeven intrinsic.
+HANDLE_TARGET_OPCODE(G_INTRINSIC_ROUNDEVEN)
+
 /// INTRINSIC readcyclecounter
 HANDLE_TARGET_OPCODE(G_READCYCLECOUNTER)
 

diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index 469ac13cca49..841a314ecc90 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -918,6 +918,12 @@ def G_INTRINSIC_LRINT : GenericInstruction {
   let hasSideEffects = 0;
 }
 
+def G_INTRINSIC_ROUNDEVEN : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1);
+  let hasSideEffects = 0;
+}
+
 def G_READCYCLECOUNTER : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins);

diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 6338f276b85d..9adf3d1a89a3 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1280,6 +1280,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
       return TargetOpcode::G_FRINT;
     case Intrinsic::round:
       return TargetOpcode::G_INTRINSIC_ROUND;
+    case Intrinsic::roundeven:
+      return TargetOpcode::G_INTRINSIC_ROUNDEVEN;
     case Intrinsic::sin:
       return TargetOpcode::G_FSIN;
     case Intrinsic::sqrt:

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 7609adfc7c65..abb983dac6bd 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -77,6 +77,8 @@ static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
     return Type::getFloatTy(Ctx);
   case 64:
     return Type::getDoubleTy(Ctx);
+  case 80:
+    return Type::getX86_FP80Ty(Ctx);
   case 128:
     return Type::getFP128Ty(Ctx);
   default:
@@ -386,7 +388,7 @@ void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
 }
 
 static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
-#define RTLIBCASE(LibcallPrefix)                                               \
+#define RTLIBCASE_INT(LibcallPrefix)                                           \
   do {                                                                         \
     switch (Size) {                                                            \
     case 32:                                                                   \
@@ -400,19 +402,33 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
     }                                                                          \
   } while (0)
 
-  assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
+#define RTLIBCASE(LibcallPrefix)                                               \
+  do {                                                                         \
+    switch (Size) {                                                            \
+    case 32:                                                                   \
+      return RTLIB::LibcallPrefix##32;                                         \
+    case 64:                                                                   \
+      return RTLIB::LibcallPrefix##64;                                         \
+    case 80:                                                                   \
+      return RTLIB::LibcallPrefix##80;                                         \
+    case 128:                                                                  \
+      return RTLIB::LibcallPrefix##128;                                        \
+    default:                                                                   \
+      llvm_unreachable("unexpected size");                                     \
+    }                                                                          \
+  } while (0)
 
   switch (Opcode) {
   case TargetOpcode::G_SDIV:
-    RTLIBCASE(SDIV_I);
+    RTLIBCASE_INT(SDIV_I);
   case TargetOpcode::G_UDIV:
-    RTLIBCASE(UDIV_I);
+    RTLIBCASE_INT(UDIV_I);
   case TargetOpcode::G_SREM:
-    RTLIBCASE(SREM_I);
+    RTLIBCASE_INT(SREM_I);
   case TargetOpcode::G_UREM:
-    RTLIBCASE(UREM_I);
+    RTLIBCASE_INT(UREM_I);
   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
-    RTLIBCASE(CTLZ_I);
+    RTLIBCASE_INT(CTLZ_I);
   case TargetOpcode::G_FADD:
     RTLIBCASE(ADD_F);
   case TargetOpcode::G_FSUB:
@@ -455,6 +471,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
     RTLIBCASE(RINT_F);
   case TargetOpcode::G_FNEARBYINT:
     RTLIBCASE(NEARBYINT_F);
+  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
+    RTLIBCASE(ROUNDEVEN_F);
   }
   llvm_unreachable("Unknown libcall function");
 }
@@ -670,10 +688,11 @@ LegalizerHelper::libcall(MachineInstr &MI) {
   case TargetOpcode::G_FMAXNUM:
   case TargetOpcode::G_FSQRT:
   case TargetOpcode::G_FRINT:
-  case TargetOpcode::G_FNEARBYINT: {
+  case TargetOpcode::G_FNEARBYINT:
+  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
     Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
-    if (!HLTy || (Size != 32 && Size != 64 && Size != 128)) {
-      LLVM_DEBUG(dbgs() << "No libcall available for size " << Size << ".\n");
+    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
+      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
       return UnableToLegalize;
     }
     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
@@ -2163,6 +2182,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
   case TargetOpcode::G_FPOW:
   case TargetOpcode::G_INTRINSIC_TRUNC:
   case TargetOpcode::G_INTRINSIC_ROUND:
+  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
     assert(TypeIdx == 0);
     Observer.changingInstr(MI);
 
@@ -2363,6 +2383,13 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
   }
 }
 
+// Legalize an instruction by changing the opcode in place.
+void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
+  Observer.changingInstr(MI);
+  MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
+  Observer.changedInstr(MI);
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
   using namespace TargetOpcode;
@@ -2461,6 +2488,12 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
     return lowerFFloor(MI);
   case TargetOpcode::G_INTRINSIC_ROUND:
     return lowerIntrinsicRound(MI);
+  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
+    // Since round-to-even is the assumed rounding mode for unconstrained FP
+    // operations, rint and roundeven are the same operation.
+    changeOpcode(MI, TargetOpcode::G_FRINT);
+    return Legalized;
+  }
   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
     Register OldValRes = MI.getOperand(0).getReg();
     Register SuccessRes = MI.getOperand(1).getReg();
@@ -3557,6 +3590,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
   case G_FFLOOR:
   case G_FRINT:
   case G_INTRINSIC_ROUND:
+  case G_INTRINSIC_ROUNDEVEN:
   case G_INTRINSIC_TRUNC:
   case G_FCOS:
   case G_FSIN:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bac4f9ab5c25..4320151d5758 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -711,7 +711,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
        .scalarize(0)
        .lower();
 
-  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
+  // Lower roundeven into G_FRINT
+  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
     .scalarize(0)
     .lower();
 

diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp
index 84f560f2f9ee..96c9a8580f8f 100644
--- a/llvm/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/X86LegalizerInfo.cpp
@@ -70,6 +70,11 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
   setLegalizerInfoAVX512DQ();
   setLegalizerInfoAVX512BW();
 
+  getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN)
+    .scalarize(0)
+    .minScalar(0, LLT::scalar(32))
+    .libcall();
+
   setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1);
   for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR})
     setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1);

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 08e416fd6316..705f7cd1a058 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -132,6 +132,9 @@
 # DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 
 # DEBUG-NEXT: G_READCYCLECOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
new file mode 100644
index 000000000000..09eda4be6197
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll
@@ -0,0 +1,566 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+define float @v_roundeven_f32(float %x) {
+; GFX6-LABEL: v_roundeven_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %roundeven = call float @llvm.roundeven.f32(float %x)
+  ret float %roundeven
+}
+
+define <2 x float> @v_roundeven_v2f32(<2 x float> %x) {
+; GFX6-LABEL: v_roundeven_v2f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v2f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v2f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX8-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %roundeven = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %x)
+  ret <2 x float> %roundeven
+}
+
+define <3 x float> @v_roundeven_v3f32(<3 x float> %x) {
+; GFX6-LABEL: v_roundeven_v3f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX6-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v3f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX7-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v3f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX8-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX8-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v3f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX9-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %roundeven = call <3 x float> @llvm.roundeven.v3f32(<3 x float> %x)
+  ret <3 x float> %roundeven
+}
+
+define <4 x float> @v_roundeven_v4f32(<4 x float> %x) {
+; GFX6-LABEL: v_roundeven_v4f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX6-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX6-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v4f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX7-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX7-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v4f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX8-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX8-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX8-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v4f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX9-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX9-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %roundeven = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
+  ret <4 x float> %roundeven
+}
+
+define half @v_roundeven_f16(half %x) {
+; GFX6-LABEL: v_roundeven_f16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %roundeven = call half @llvm.roundeven.f16(half %x)
+  ret half %roundeven
+}
+
+define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
+; GFX6-LABEL: v_roundeven_v2f16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v2f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f16_e32 v1, v0
+; GFX8-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x)
+  ret <2 x half> %roundeven
+}
+
+define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
+; GFX6-LABEL: v_roundeven_v2f16_fneg:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v1, v2
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v2f16_fneg:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v1, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v2f16_fneg:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT:    v_rndne_f16_e32 v1, v0
+; GFX8-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 16
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v2f16_fneg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %x.fneg = fneg <2 x half> %x
+  %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg)
+  ret <2 x half> %roundeven
+}
+
+define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
+; GFX6-LABEL: v_roundeven_v4f16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX6-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX6-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v4f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX7-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX7-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f16_e32 v2, v0
+; GFX8-NEXT:    v_rndne_f16_e32 v3, v1
+; GFX8-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 16
+; GFX8-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v4f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f16_e32 v2, v0
+; GFX9-NEXT:    v_rndne_f16_e32 v3, v1
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_and_or_b32 v0, v2, v4, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v3, v4, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
+  ret <4 x half> %roundeven
+}
+
+
+define float @v_roundeven_f32_fabs(float %x) {
+; GFX6-LABEL: v_roundeven_f32_fabs:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_rndne_f32_e64 v0, |v0|
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f32_fabs:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_rndne_f32_e64 v0, |v0|
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f32_fabs:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f32_e64 v0, |v0|
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f32_fabs:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f32_e64 v0, |v0|
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %fabs.x = call float @llvm.fabs.f32(float %x)
+  %roundeven = call float @llvm.roundeven.f32(float %fabs.x)
+  ret float %roundeven
+}
+
+define amdgpu_ps float @s_roundeven_f32(float inreg %x) {
+; GFX6-LABEL: s_roundeven_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    v_rndne_f32_e32 v0, s0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX7-LABEL: s_roundeven_f32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    v_rndne_f32_e32 v0, s0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_roundeven_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    v_rndne_f32_e32 v0, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_roundeven_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_rndne_f32_e32 v0, s0
+; GFX9-NEXT:    ; return to shader part epilog
+  %roundeven = call float @llvm.roundeven.f32(float %x)
+  ret float %roundeven
+}
+
+define float @v_roundeven_f32_fneg(float %x) {
+; GFX6-LABEL: v_roundeven_f32_fneg:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_rndne_f32_e64 v0, -v0
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f32_fneg:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_rndne_f32_e64 v0, -v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f32_fneg:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f32_e64 v0, -v0
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f32_fneg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f32_e64 v0, -v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = fneg float %x
+  %roundeven = call float @llvm.roundeven.f32(float %neg.x)
+  ret float %roundeven
+}
+
+define double @v_roundeven_f64(double %x) {
+; GFX6-LABEL: v_roundeven_f64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x80000000, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    v_or_b32_e32 v3, 0x43300000, v3
+; GFX6-NEXT:    v_add_f64 v[4:5], v[0:1], v[2:3]
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x432fffff
+; GFX6-NEXT:    v_add_f64 v[2:3], v[4:5], -v[2:3]
+; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %roundeven = call double @llvm.roundeven.f64(double %x)
+  ret double %roundeven
+}
+
+define double @v_roundeven_f64_fneg(double %x) {
+; GFX6-LABEL: v_roundeven_f64_fneg:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
+; GFX6-NEXT:    v_and_b32_e32 v4, 0x80000000, v2
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0
+; GFX6-NEXT:    v_or_b32_e32 v4, 0x43300000, v4
+; GFX6-NEXT:    v_add_f64 v[5:6], -v[0:1], v[3:4]
+; GFX6-NEXT:    v_mov_b32_e32 v1, v0
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x432fffff
+; GFX6-NEXT:    v_add_f64 v[3:4], v[5:6], -v[3:4]
+; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[1:2]|, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_f64_fneg:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_f64_fneg:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_f64_fneg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = fneg double %x
+  %roundeven = call double @llvm.roundeven.f64(double %neg.x)
+  ret double %roundeven
+}
+
+define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
+; GFX6-LABEL: v_roundeven_v2f64:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    s_brev_b32 s6, 1
+; GFX6-NEXT:    s_mov_b32 s7, 0x43300000
+; GFX6-NEXT:    v_and_b32_e32 v5, s6, v1
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0
+; GFX6-NEXT:    v_or_b32_e32 v5, s7, v5
+; GFX6-NEXT:    v_add_f64 v[6:7], v[0:1], v[4:5]
+; GFX6-NEXT:    s_mov_b32 s4, -1
+; GFX6-NEXT:    s_mov_b32 s5, 0x432fffff
+; GFX6-NEXT:    v_add_f64 v[5:6], v[6:7], -v[4:5]
+; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX6-NEXT:    v_and_b32_e32 v5, s6, v3
+; GFX6-NEXT:    v_or_b32_e32 v5, s7, v5
+; GFX6-NEXT:    v_add_f64 v[7:8], v[2:3], v[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX6-NEXT:    v_add_f64 v[4:5], v[7:8], -v[4:5]
+; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_roundeven_v2f64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX7-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_roundeven_v2f64:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX8-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_roundeven_v2f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %roundeven = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x)
+  ret <2 x double> %roundeven
+}
+
+declare half @llvm.roundeven.f16(half) #0
+declare <2 x half> @llvm.roundeven.v2f16(<2 x half>) #0
+declare <4 x half> @llvm.roundeven.v4f16(<4 x half>) #0
+
+declare float @llvm.roundeven.f32(float) #0
+declare <2 x float> @llvm.roundeven.v2f32(<2 x float>) #0
+declare <3 x float> @llvm.roundeven.v3f32(<3 x float>) #0
+declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) #0
+
+declare double @llvm.roundeven.f64(double) #0
+declare <2 x double> @llvm.roundeven.v2f64(<2 x double>) #0
+
+declare half @llvm.fabs.f16(half) #0
+declare float @llvm.fabs.f32(float) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }

diff --git a/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll b/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll
new file mode 100644
index 000000000000..119821e91b38
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/roundeven.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=x86_64-linux-gnu < %s | FileCheck %s
+
+; FIXME: Calling convention lowering fails
+; define half @roundeven_f16(half %x) {
+;   %roundeven = call half @llvm.roundeven.f16(half %x)
+;   ret half %roundeven
+; }
+
+define float @roundeven_f32(float %x) {
+; CHECK-LABEL: roundeven_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq roundevenf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %roundeven = call float @llvm.roundeven.f32(float %x)
+  ret float %roundeven
+}
+
+define double @roundeven_f64(double %x) {
+; CHECK-LABEL: roundeven_f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq roundeven
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %roundeven = call double @llvm.roundeven.f64(double %x)
+  ret double %roundeven
+}
+
+; FIXME: Insert fails
+; define x86_fp80 @roundeven_fp80(x86_fp80 %x) {
+;   %roundeven = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %x)
+;   ret x86_fp80 %roundeven
+; }
+
+define fp128 @roundeven_f128(fp128 %x) {
+; CHECK-LABEL: roundeven_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq roundevenl
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %roundeven = call fp128 @llvm.roundeven.f128(fp128 %x)
+  ret fp128 %roundeven
+}
+
+; FIXME: Fails on build_vector
+; define <4 x float> @roundeven_v4f32(<4 x float> %x) {
+;   %roundeven = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
+;   ret <4 x float> %roundeven
+; }
+
+declare half @llvm.roundeven.f16(half) #0
+declare float @llvm.roundeven.f32(float) #0
+declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) #0
+declare double @llvm.roundeven.f64(double) #0
+declare x86_fp80 @llvm.roundeven.f80(x86_fp80) #0
+declare fp128 @llvm.roundeven.f128(fp128) #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }

More information about the llvm-commits mailing list