[llvm] AMDGPU: Use pattern to select instruction for intrinsic llvm.fptrunc.round (PR #105761)

Changpeng Fang via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 28 16:39:16 PDT 2024


https://github.com/changpeng updated https://github.com/llvm/llvm-project/pull/105761

>From 555f0ac21e261c8f6332595cc250350f9fd1d7ea Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Thu, 22 Aug 2024 17:01:15 -0700
Subject: [PATCH 1/5] AMDGPU: Use pattern to select instruction for
 llvm.fptrunc.round

  Use GCNPat instead of Custom Lowering to select instructions for
intrinsic llvm.fptrunc.round. "SupportedRoundMode : TImmLeaf" is
used as a predicate to select only when the rounding mode is supported
by the hardware. "as_hw_round_mode : SDNodeXForm" is developed to
translate the round modes to the corresponding ones that hardware
recognizes.
---
 llvm/include/llvm/Support/TargetOpcodes.def   |   2 +-
 llvm/include/llvm/Target/GenericOpcodes.td    |   2 +-
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |   3 +-
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |   2 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   3 +-
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td         |   3 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   1 -
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   1 -
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |   6 +
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |   3 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  35 +---
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h  |   1 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  31 +---
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   1 -
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  14 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  11 +-
 .../GlobalISel/legalizer-info-validation.mir  |   2 +-
 .../CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll |   7 +-
 .../test/CodeGen/AMDGPU/llvm.fptrunc.round.ll | 158 +++++++++---------
 19 files changed, 122 insertions(+), 164 deletions(-)

diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index 9fb6de49fb2055..80e3c90d346244 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -343,7 +343,7 @@ HANDLE_TARGET_OPCODE(G_FREEZE)
 HANDLE_TARGET_OPCODE(G_CONSTANT_FOLD_BARRIER)
 
 // INTRINSIC fptrunc_round intrinsic.
-HANDLE_TARGET_OPCODE(G_INTRINSIC_FPTRUNC_ROUND)
+HANDLE_TARGET_OPCODE(G_FPTRUNC_ROUND)
 
 /// INTRINSIC trunc intrinsic.
 HANDLE_TARGET_OPCODE(G_INTRINSIC_TRUNC)
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index 36a0a087ba457c..439600d940ed3e 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -1141,7 +1141,7 @@ def G_RESET_FPMODE : GenericInstruction {
 //------------------------------------------------------------------------------
 // Opcodes for LLVM Intrinsics
 //------------------------------------------------------------------------------
-def G_INTRINSIC_FPTRUNC_ROUND : GenericInstruction {
+def G_FPTRUNC_ROUND : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type1:$src1, i32imm:$round_mode);
   let hasSideEffects = false;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index f44af78cded46d..2f7315b48a03c8 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2522,8 +2522,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
 
     // Add the Rounding mode as an integer
     MIRBuilder
-        .buildInstr(TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND,
-                    {getOrCreateVReg(CI)},
+        .buildInstr(TargetOpcode::G_FPTRUNC_ROUND, {getOrCreateVReg(CI)},
                     {getOrCreateVReg(*CI.getArgOperand(0))}, Flags)
         .addImm((int)*RoundMode);
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 3fece81df1f2fd..8224551ac76061 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5041,7 +5041,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/});
   case G_BITCAST:
     return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
-  case G_INTRINSIC_FPTRUNC_ROUND:
+  case G_FPTRUNC_ROUND:
     return fewerElementsVectorMultiEltType(GMI, NumElts, {2});
   default:
     return UnableToLegalize;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 60dcb118542785..851c646b1cb333 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6957,8 +6957,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue Result;
     Result = DAG.getNode(
         ISD::FPTRUNC_ROUND, sdl, VT, getValue(I.getArgOperand(0)),
-        DAG.getTargetConstant((int)*RoundMode, sdl,
-                              TLI.getPointerTy(DAG.getDataLayout())));
+        DAG.getTargetConstant((int)*RoundMode, sdl, MVT::i32));
     setValue(&I, Result);
 
     return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 8bee84b8a87f27..2fcb5727cb5e97 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -419,3 +419,6 @@ def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameInde
 
 def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
   GISDNodeXFormEquiv<FPPow2ToExponentXForm>;
+
+def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
+  GISDNodeXFormEquiv<as_hw_round_mode>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e57c8f8b7b4835..4ace7d08ea59ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5500,7 +5500,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CONST_DATA_PTR)
   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
   NODE_NAME_CASE(LDS)
-  NODE_NAME_CASE(FPTRUNC_ROUND)
   NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
   NODE_NAME_CASE(LOAD_D16_HI)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 59f640ea99de3e..dd9d97bd593bda 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -553,7 +553,6 @@ enum NodeType : unsigned {
   CONST_DATA_PTR,
   PC_ADD_REL_OFFSET,
   LDS,
-  FPTRUNC_ROUND,
 
   DUMMY_CHAIN,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 05ed1b322c0d1b..369165f82643c6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5595,6 +5595,12 @@ void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
   MIB.addImm(ExpVal);
 }
 
+void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
+                                                const MachineInstr &MI,
+                                                int OpIdx) const {
+  MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
+}
+
 bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
   return TII.isInlineConstant(Imm);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 69806b240cf2bc..40c6cf8fd3b51f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -359,6 +359,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   void renderFPPow2ToExponent(MachineInstrBuilder &MIB, const MachineInstr &MI,
                               int OpIdx) const;
 
+  void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                       int OpIdx) const;
+
   bool isInlineImmediate(const APInt &Imm) const;
   bool isInlineImmediate(const APFloat &Imm) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 4fd917f5ea7fa8..eaf540003ec6fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1136,8 +1136,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0)
       .lower();
 
-  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
-      .customFor({S16, S32})
+  getActionDefinitionsBuilder(G_FPTRUNC_ROUND)
+      .legalFor({S16, S32})
       .scalarize(0)
       .lower();
 
@@ -2179,8 +2179,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
     return legalizeCTLZ_CTTZ(MI, MRI, B);
   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
     return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
-  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
-    return legalizeFPTruncRound(MI, B);
   case TargetOpcode::G_STACKSAVE:
     return legalizeStackSave(MI, B);
   case TargetOpcode::G_GET_FPENV:
@@ -7093,35 +7091,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
   return true;
 }
 
-bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
-                                               MachineIRBuilder &B) const {
-  MachineRegisterInfo &MRI = *B.getMRI();
-  Register Src = MI.getOperand(1).getReg();
-  if (MRI.getType(Src) != LLT::scalar(32))
-    return false;
-
-  // Only support towardzero, tonearest, upward and downward.
-  int RoundMode = MI.getOperand(2).getImm();
-  if (RoundMode != (int)RoundingMode::TowardZero &&
-      RoundMode != (int)RoundingMode::NearestTiesToEven &&
-      RoundMode != (int)RoundingMode::TowardPositive &&
-      RoundMode != (int)RoundingMode::TowardNegative)
-    return false;
-
-  // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
-  // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
-  // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
-  // "round.downward    -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
-  unsigned HW_Mode = (RoundMode + 3) % 4;
-  B.buildInstr(AMDGPU::G_FPTRUNC_ROUND)
-      .addDef(MI.getOperand(0).getReg())
-      .addUse(Src)
-      .addImm(HW_Mode);
-
-  MI.eraseFromParent();
-  return true;
-}
-
 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
                                             MachineIRBuilder &B) const {
   const SITargetLowering *TLI = ST.getTargetLowering();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index db1c5874093a71..a815e87a7da35f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -212,7 +212,6 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
 
   bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
 
-  bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
   bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
   bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c954c0aa71f734..e50bc2a3c42ef7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -598,8 +598,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
     // F16 - VOP1 Actions.
     setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
-                        ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
-                       MVT::f16, Custom);
+                        ISD::FSIN, ISD::FROUND}, MVT::f16, Custom);
 
     setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
     setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
@@ -5796,8 +5795,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FP_ROUND:
   case ISD::STRICT_FP_ROUND:
     return lowerFP_ROUND(Op, DAG);
-  case ISD::FPTRUNC_ROUND:
-    return lowerFPTRUNC_ROUND(Op, DAG);
   case ISD::TRAP:
     return lowerTRAP(Op, DAG);
   case ISD::DEBUGTRAP:
@@ -6647,30 +6644,6 @@ SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
                 DAG.getTargetConstant(0, DL, MVT::i32));
 }
 
-SDValue SITargetLowering::lowerFPTRUNC_ROUND(SDValue Op,
-                                             SelectionDAG &DAG) const {
-  if (Op.getOperand(0)->getValueType(0) != MVT::f32)
-    return SDValue();
-
-  // Only support towardzero, tonearest, upward and downward.
-  int RoundMode = Op.getConstantOperandVal(1);
-  if (RoundMode != (int)RoundingMode::TowardZero &&
-      RoundMode != (int)RoundingMode::NearestTiesToEven &&
-      RoundMode != (int)RoundingMode::TowardPositive &&
-      RoundMode != (int)RoundingMode::TowardNegative)
-    return SDValue();
-
-  // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
-  // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
-  // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
-  // "round.downward    -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
-  unsigned HW_Mode = (RoundMode + 3) % 4;
-  SDLoc DL(Op);
-  SDValue RoundFlag = DAG.getTargetConstant(HW_Mode, DL, MVT::i32);
-  return DAG.getNode(AMDGPUISD::FPTRUNC_ROUND, DL, Op.getNode()->getVTList(),
-                     Op->getOperand(0), RoundFlag);
-}
-
 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   assert(Op.getValueType() == MVT::f16 &&
          "Do not know how to custom lower FP_ROUND for non-f16 type");
@@ -12830,7 +12803,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
   case AMDGPU::G_FFLOOR:
   case AMDGPU::G_FRINT:
   case AMDGPU::G_FNEARBYINT:
-  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
+  case AMDGPU::G_FPTRUNC_ROUND:
   case AMDGPU::G_INTRINSIC_TRUNC:
   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
   case AMDGPU::G_FMA:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index eed4b3e79cdeee..1f198a92c0fa6a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -145,7 +145,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerFPTRUNC_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 2b54429dc9a03f..1690e90957a707 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -308,7 +308,7 @@ def SDTFPRoundModeOp  : SDTypeProfile<1, 2, [
   SDTCisFP<0>, SDTCisFP<1>, SDTCisInt<2>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>
 ]>;
 
-def SIfptrunc_round : SDNode<"AMDGPUISD::FPTRUNC_ROUND", SDTFPRoundModeOp>;
+def SIfptrunc_round : SDNode<"ISD::FPTRUNC_ROUND", SDTFPRoundModeOp>;
 
 //===----------------------------------------------------------------------===//
 // ValueType helpers
@@ -796,6 +796,18 @@ return CurDAG->getTargetConstant(
   N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
 }]>;
 
+def as_hw_round_mode : SDNodeXForm<timm, [{
+  return CurDAG->getTargetConstant((N->getSExtValue() + 3) % 4, SDLoc(N),
+                                    MVT::i32);
+}]>;
+
+def SupportedRoundMode : TImmLeaf<i32, [{
+  return Imm == (int)RoundingMode::TowardZero ||
+         Imm == (int)RoundingMode::NearestTiesToEven ||
+         Imm == (int)RoundingMode::TowardPositive ||
+         Imm == (int)RoundingMode::TowardNegative;
+}]>;
+
 class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
   uint64_t Imm = N->getZExtValue();
   unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e7831d00a3a4a8..22abaf4448a3af 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -229,10 +229,12 @@ def S_INVERSE_BALLOT_U64 : SPseudoInstSI<
 // in the ModeRegister pass.
 let Uses = [MODE, EXEC] in {
 def FPTRUNC_ROUND_F16_F32_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
-  (ins VGPR_32:$src0, i32imm:$round),
-  [(set f16:$vdst, (SIfptrunc_round f32:$src0, i32:$round))]>;
+  (ins VGPR_32:$src0, i32imm:$round)>;
 } // End Uses = [MODE, EXEC]
 
+def : GCNPat <(f16 (SIfptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))),
+     (FPTRUNC_ROUND_F16_F32_PSEUDO $src0, (as_hw_round_mode $round))>;
+
 // Invert the exec mask and overwrite the inactive lanes of dst with inactive,
 // restoring it after we're done.
 let Defs = [SCC], isConvergent = 1 in {
@@ -4064,11 +4066,6 @@ def G_SI_CALL : AMDGPUGenericInstruction {
   let isConvergent = 1;
 }
 
-def G_FPTRUNC_ROUND : AMDGPUGenericInstruction {
-  let OutOperandList = (outs type0:$vdst);
-  let InOperandList = (ins type1:$src0, untyped_imm_0:$round);
-  let hasSideEffects = 0;
-}
 
 //============================================================================//
 // Dummy Instructions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 87a415b45cca9a..5d0fd91566bcca 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -145,7 +145,7 @@
 # DEBUG-NEXT: .. the first uncovered imm index: {{[0-9]+}}, OK
 
 #
-# DEBUG-NEXT: G_INTRINSIC_FPTRUNC_ROUND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
+# DEBUG-NEXT: G_FPTRUNC_ROUND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 #
diff --git a/llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll
index 9fa3eb22a554a8..6454e04d2fa432 100644
--- a/llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll
@@ -1,9 +1,8 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefixes=SDAG-FAIL
-; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=GISEL-FAIL
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL
+; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL
 
 define amdgpu_gs void @test_fptrunc_round_f64(double %a, ptr addrspace(1) %out) {
-; SDAG-FAIL: LLVM ERROR: Cannot select
-; GISEL-FAIL: unable to legalize instruction
+; FAIL: LLVM ERROR: Cannot select
   %res = call half @llvm.fptrunc.round.f64(double %a, metadata !"round.upward")
   store half %res, ptr addrspace(1) %out, align 4
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
index 71d0ee524bab73..54ed6f1eb42820 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
@@ -176,8 +176,7 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> %
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GISEL-NEXT:    ; return to shader part epilog
   %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
   ret <2 x half> %res
@@ -197,8 +196,7 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float>
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GISEL-NEXT:    ; return to shader part epilog
   %res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
   ret <2 x half> %res
@@ -228,23 +226,18 @@ define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v2
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v2
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v3
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT:    v_lshl_or_b32 v1, v7, 16, v6
-; GISEL-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v3
+; GISEL-NEXT:    v_pack_b32_f16 v3, v6, v7
+; GISEL-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
-; GISEL-NEXT:    v_pk_add_f16 v0, v0, v1
-; GISEL-NEXT:    v_pk_add_f16 v0, v2, v0
+; GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
+; GISEL-NEXT:    v_pk_add_f16 v0, v1, v0
 ; GISEL-NEXT:    global_store_dword v[4:5], v0, off
 ; GISEL-NEXT:    s_endpgm
   %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
@@ -295,31 +288,54 @@ define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float>
 }
 
 define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> inreg %a, <2 x float> inreg %b, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    v_mov_b32_e32 v3, s2
-; CHECK-NEXT:    v_mov_b32_e32 v4, s1
-; CHECK-NEXT:    v_mov_b32_e32 v5, s3
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v6, v3
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v7, v5
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CHECK-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; CHECK-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; CHECK-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; CHECK-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v4, v5
-; CHECK-NEXT:    v_lshl_or_b32 v5, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
-; CHECK-NEXT:    v_pk_add_f16 v2, v2, v5
-; CHECK-NEXT:    v_pk_add_f16 v2, v3, v2
-; CHECK-NEXT:    global_store_dword v[0:1], v2, off
-; CHECK-NEXT:    s_endpgm
+; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; SDAG-NEXT:    v_mov_b32_e32 v3, s2
+; SDAG-NEXT:    v_mov_b32_e32 v4, s1
+; SDAG-NEXT:    v_mov_b32_e32 v5, s3
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v6, v3
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v7, v5
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SDAG-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SDAG-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SDAG-NEXT:    v_lshl_or_b32 v2, v4, 16, v2
+; SDAG-NEXT:    v_cvt_f16_f32_e32 v4, v5
+; SDAG-NEXT:    v_lshl_or_b32 v5, v7, 16, v6
+; SDAG-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; SDAG-NEXT:    v_pk_add_f16 v2, v2, v5
+; SDAG-NEXT:    v_pk_add_f16 v2, v3, v2
+; SDAG-NEXT:    global_store_dword v[0:1], v2, off
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GISEL-NEXT:    v_mov_b32_e32 v5, s3
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v4
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v5
+; GISEL-NEXT:    v_pack_b32_f16 v2, v2, v3
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v4
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v5
+; GISEL-NEXT:    v_pack_b32_f16 v5, v6, v7
+; GISEL-NEXT:    v_pack_b32_f16 v3, v3, v4
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GISEL-NEXT:    v_pk_add_f16 v2, v2, v5
+; GISEL-NEXT:    v_pk_add_f16 v2, v3, v2
+; GISEL-NEXT:    global_store_dword v[0:1], v2, off
+; GISEL-NEXT:    s_endpgm
   %res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
   %res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
   %res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
@@ -344,8 +360,7 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GISEL-NEXT:    ; return to shader part epilog
   %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward")
@@ -367,8 +382,7 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float>
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GISEL-NEXT:    ; return to shader part epilog
   %res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward")
@@ -391,13 +405,11 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
 ; GISEL-NEXT:    ; return to shader part epilog
   %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.upward")
   ret <4 x half> %res
@@ -419,13 +431,11 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float>
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
 ; GISEL-NEXT:    ; return to shader part epilog
   %res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.downward")
   ret <4 x half> %res
@@ -453,21 +463,17 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> %
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
-; GISEL-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; GISEL-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
+; GISEL-NEXT:    v_pack_b32_f16 v2, v4, v5
+; GISEL-NEXT:    v_pack_b32_f16 v3, v6, v7
 ; GISEL-NEXT:    ; return to shader part epilog
   %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.upward")
   ret <8 x half> %res
@@ -495,21 +501,17 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float>
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GISEL-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v2
-; GISEL-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; GISEL-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT:    v_pack_b32_f16 v1, v2, v3
+; GISEL-NEXT:    v_pack_b32_f16 v2, v4, v5
+; GISEL-NEXT:    v_pack_b32_f16 v3, v6, v7
 ; GISEL-NEXT:    ; return to shader part epilog
   %res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward")
   ret <8 x half> %res

>From ad98e1520d3d2227f33e30b5080c5a95d35ccbbc Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Fri, 23 Aug 2024 10:32:55 -0700
Subject: [PATCH 2/5]  AMDGPU: Use pattern to select instruction for
 llvm.fptrunc.round

  Don't do the renaming of G_INTRINSIC_FPTRUNC_ROUND;
  Don't change the type of round_mode operand of FPTRUNC_ROUND to
 MVT::i32.
---
 llvm/include/llvm/Support/TargetOpcodes.def                    | 2 +-
 llvm/include/llvm/Target/GenericOpcodes.td                     | 2 +-
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp                   | 3 ++-
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp                | 2 +-
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp          | 3 ++-
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td                          | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp                 | 2 +-
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp              | 2 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp                      | 2 +-
 .../CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir   | 2 +-
 10 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index 80e3c90d346244..9fb6de49fb2055 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -343,7 +343,7 @@ HANDLE_TARGET_OPCODE(G_FREEZE)
 HANDLE_TARGET_OPCODE(G_CONSTANT_FOLD_BARRIER)
 
 // INTRINSIC fptrunc_round intrinsic.
-HANDLE_TARGET_OPCODE(G_FPTRUNC_ROUND)
+HANDLE_TARGET_OPCODE(G_INTRINSIC_FPTRUNC_ROUND)
 
 /// INTRINSIC trunc intrinsic.
 HANDLE_TARGET_OPCODE(G_INTRINSIC_TRUNC)
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index 439600d940ed3e..36a0a087ba457c 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -1141,7 +1141,7 @@ def G_RESET_FPMODE : GenericInstruction {
 //------------------------------------------------------------------------------
 // Opcodes for LLVM Intrinsics
 //------------------------------------------------------------------------------
-def G_FPTRUNC_ROUND : GenericInstruction {
+def G_INTRINSIC_FPTRUNC_ROUND : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type1:$src1, i32imm:$round_mode);
   let hasSideEffects = false;
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 2f7315b48a03c8..f44af78cded46d 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2522,7 +2522,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
 
     // Add the Rounding mode as an integer
     MIRBuilder
-        .buildInstr(TargetOpcode::G_FPTRUNC_ROUND, {getOrCreateVReg(CI)},
+        .buildInstr(TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND,
+                    {getOrCreateVReg(CI)},
                     {getOrCreateVReg(*CI.getArgOperand(0))}, Flags)
         .addImm((int)*RoundMode);
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 8224551ac76061..3fece81df1f2fd 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -5041,7 +5041,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/});
   case G_BITCAST:
     return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
-  case G_FPTRUNC_ROUND:
+  case G_INTRINSIC_FPTRUNC_ROUND:
     return fewerElementsVectorMultiEltType(GMI, NumElts, {2});
   default:
     return UnableToLegalize;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 851c646b1cb333..60dcb118542785 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6957,7 +6957,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue Result;
     Result = DAG.getNode(
         ISD::FPTRUNC_ROUND, sdl, VT, getValue(I.getArgOperand(0)),
-        DAG.getTargetConstant((int)*RoundMode, sdl, MVT::i32));
+        DAG.getTargetConstant((int)*RoundMode, sdl,
+                              TLI.getPointerTy(DAG.getDataLayout())));
     setValue(&I, Result);
 
     return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 2fcb5727cb5e97..91f7d5de46d8b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -297,7 +297,7 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
 def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
 def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
 
-def : GINodeEquiv<G_FPTRUNC_ROUND, SIfptrunc_round>;
+def : GINodeEquiv<G_INTRINSIC_FPTRUNC_ROUND, SIfptrunc_round>;
 
 class GISelSop2Pat <
   SDPatternOperator node,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index eaf540003ec6fe..3f6486d44f0ee5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1136,7 +1136,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0)
       .lower();
 
-  getActionDefinitionsBuilder(G_FPTRUNC_ROUND)
+  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
       .legalFor({S16, S32})
       .scalarize(0)
       .lower();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 69a1936a11fe05..4737a322c255f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5255,7 +5255,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
     break;
   }
-  case AMDGPU::G_FPTRUNC_ROUND:
+  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
     return getDefaultMappingVOP(MI);
   case AMDGPU::G_PREFETCH:
     OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e50bc2a3c42ef7..6e5eb931100a35 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12803,7 +12803,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
   case AMDGPU::G_FFLOOR:
   case AMDGPU::G_FRINT:
   case AMDGPU::G_FNEARBYINT:
-  case AMDGPU::G_FPTRUNC_ROUND:
+  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
   case AMDGPU::G_INTRINSIC_TRUNC:
   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
   case AMDGPU::G_FMA:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 5d0fd91566bcca..87a415b45cca9a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -145,7 +145,7 @@
 # DEBUG-NEXT: .. the first uncovered imm index: {{[0-9]+}}, OK
 
 #
-# DEBUG-NEXT: G_FPTRUNC_ROUND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
+# DEBUG-NEXT: G_INTRINSIC_FPTRUNC_ROUND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 #

>From 0767994fb5cee9e6fa3d934e5139712be321d69b Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Fri, 23 Aug 2024 11:05:15 -0700
Subject: [PATCH 3/5] AMDGPU: Use pattern to select instruction for intrinsic
 llvm.fptrunc.round

  Still need to change the round_mode operand type to i32 for FPTRUNC_ROUND,
otherwise we cannot do the pattern matching (or not easily).
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 60dcb118542785..851c646b1cb333 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6957,8 +6957,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue Result;
     Result = DAG.getNode(
         ISD::FPTRUNC_ROUND, sdl, VT, getValue(I.getArgOperand(0)),
-        DAG.getTargetConstant((int)*RoundMode, sdl,
-                              TLI.getPointerTy(DAG.getDataLayout())));
+        DAG.getTargetConstant((int)*RoundMode, sdl, MVT::i32));
     setValue(&I, Result);
 
     return;

>From df80760143b15f19d4135050c5adb904821308d4 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Wed, 28 Aug 2024 13:57:41 -0700
Subject: [PATCH 4/5] AMDGPU: Rename fail.llvm.fptrunc.round.ll to
 llvm.fptrunc.round.err.ll

---
 .../{fail.llvm.fptrunc.round.ll => llvm.fptrunc.round.err.ll} | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename llvm/test/CodeGen/AMDGPU/{fail.llvm.fptrunc.round.ll => llvm.fptrunc.round.err.ll} (75%)

diff --git a/llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll
similarity index 75%
rename from llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll
rename to llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll
index 6454e04d2fa432..f1d5b07e832c48 100644
--- a/llvm/test/CodeGen/AMDGPU/fail.llvm.fptrunc.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll
@@ -3,9 +3,9 @@
 
 define amdgpu_gs void @test_fptrunc_round_f64(double %a, ptr addrspace(1) %out) {
 ; FAIL: LLVM ERROR: Cannot select
-  %res = call half @llvm.fptrunc.round.f64(double %a, metadata !"round.upward")
+  %res = call half @llvm.fptrunc.round.f16.f64(double %a, metadata !"round.upward")
   store half %res, ptr addrspace(1) %out, align 4
   ret void
 }
 
-declare half @llvm.fptrunc.round.f64(double, metadata)
+declare half @llvm.fptrunc.round.f16.f64(double, metadata)

>From 15c09f7fcc9cc7c995fb26908a3338125c56c94b Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Wed, 28 Aug 2024 16:36:58 -0700
Subject: [PATCH 5/5] AMDGPU: Use pattern to select instruction for intrinsic
 llvm.fptrunc.round

 Add comments to explain the rounding mode mapping.
---
 llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 4 ++++
 llvm/lib/Target/AMDGPU/SIInstrInfo.td                | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3562f03b4af664..3fcb364fc2c536 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5597,6 +5597,10 @@ void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
 void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
+  // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
+  // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
+  // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
+  // "round.downward"   -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
   MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 1690e90957a707..4f7fbcb7067e2f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -797,6 +797,10 @@ return CurDAG->getTargetConstant(
 }]>;
 
 def as_hw_round_mode : SDNodeXForm<timm, [{
+  // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
+  // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
+  // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
+  // "round.downward"   -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
   return CurDAG->getTargetConstant((N->getSExtValue() + 3) % 4, SDLoc(N),
                                     MVT::i32);
 }]>;



More information about the llvm-commits mailing list