[llvm] AMDGPU: Use pattern to select instruction for intrinsic llvm.fptrunc.round (PR #105761)
Changpeng Fang via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 28 17:25:34 PDT 2024
https://github.com/changpeng updated https://github.com/llvm/llvm-project/pull/105761
>From b43875cf6c3e02ca9a3908b62bf0d1dad06d230a Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Thu, 22 Aug 2024 17:01:15 -0700
Subject: [PATCH] AMDGPU: Use pattern to select instruction for
llvm.fptrunc.round
Use GCNPat instead of Custom Lowering to select instructions for
intrinsic llvm.fptrunc.round. "SupportedRoundMode : TImmLeaf" is
used as a predicate to select only when the rounding mode is supported
by the hardware. "as_hw_round_mode : SDNodeXForm" is developed to
translate the round modes to the corresponding ones that hardware
recognizes.
---
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 5 +-
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 -
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 -
.../AMDGPU/AMDGPUInstructionSelector.cpp | 10 ++
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 3 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 33 +---
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 1 -
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 29 +---
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 -
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 18 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 11 +-
.../CodeGen/AMDGPU/llvm.fptrunc.round.err.ll | 7 +-
.../test/CodeGen/AMDGPU/llvm.fptrunc.round.ll | 158 +++++++++---------
14 files changed, 124 insertions(+), 156 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 8bee84b8a87f27..91f7d5de46d8b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -297,7 +297,7 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
-def : GINodeEquiv<G_FPTRUNC_ROUND, SIfptrunc_round>;
+def : GINodeEquiv<G_INTRINSIC_FPTRUNC_ROUND, SIfptrunc_round>;
class GISelSop2Pat <
SDPatternOperator node,
@@ -419,3 +419,6 @@ def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameInde
def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
GISDNodeXFormEquiv<FPPow2ToExponentXForm>;
+
+def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
+ GISDNodeXFormEquiv<as_hw_round_mode>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 96143d688801aa..04009273038d7c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5506,7 +5506,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
NODE_NAME_CASE(LDS)
- NODE_NAME_CASE(FPTRUNC_ROUND)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(LOAD_D16_HI)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 59f640ea99de3e..dd9d97bd593bda 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -553,7 +553,6 @@ enum NodeType : unsigned {
CONST_DATA_PTR,
PC_ADD_REL_OFFSET,
LDS,
- FPTRUNC_ROUND,
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 17071970ca4bfe..3fcb364fc2c536 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5594,6 +5594,16 @@ void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
MIB.addImm(ExpVal);
}
+void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
+ // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
+ // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
+ // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
+ MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
+}
+
bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
return TII.isInlineConstant(Imm);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 207cd67f0eda0e..068db5c1c14496 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -359,6 +359,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
void renderFPPow2ToExponent(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
bool isInlineImmediate(const APInt &Imm) const;
bool isInlineImmediate(const APFloat &Imm) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 4fd917f5ea7fa8..3f6486d44f0ee5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1137,7 +1137,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
- .customFor({S16, S32})
+ .legalFor({S16, S32})
.scalarize(0)
.lower();
@@ -2179,8 +2179,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
return legalizeCTLZ_CTTZ(MI, MRI, B);
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
- case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
- return legalizeFPTruncRound(MI, B);
case TargetOpcode::G_STACKSAVE:
return legalizeStackSave(MI, B);
case TargetOpcode::G_GET_FPENV:
@@ -7093,35 +7091,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
return true;
}
-bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
- MachineIRBuilder &B) const {
- MachineRegisterInfo &MRI = *B.getMRI();
- Register Src = MI.getOperand(1).getReg();
- if (MRI.getType(Src) != LLT::scalar(32))
- return false;
-
- // Only support towardzero, tonearest, upward and downward.
- int RoundMode = MI.getOperand(2).getImm();
- if (RoundMode != (int)RoundingMode::TowardZero &&
- RoundMode != (int)RoundingMode::NearestTiesToEven &&
- RoundMode != (int)RoundingMode::TowardPositive &&
- RoundMode != (int)RoundingMode::TowardNegative)
- return false;
-
- // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
- // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
- // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
- // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
- unsigned HW_Mode = (RoundMode + 3) % 4;
- B.buildInstr(AMDGPU::G_FPTRUNC_ROUND)
- .addDef(MI.getOperand(0).getReg())
- .addUse(Src)
- .addImm(HW_Mode);
-
- MI.eraseFromParent();
- return true;
-}
-
bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
MachineIRBuilder &B) const {
const SITargetLowering *TLI = ST.getTargetLowering();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index db1c5874093a71..a815e87a7da35f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -212,7 +212,6 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
- bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 69a1936a11fe05..4737a322c255f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5255,7 +5255,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
break;
}
- case AMDGPU::G_FPTRUNC_ROUND:
+ case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
return getDefaultMappingVOP(MI);
case AMDGPU::G_PREFETCH:
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1437f3d58b5e79..4b4ed2f56db022 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -598,8 +598,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP1 Actions.
setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
- ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
- MVT::f16, Custom);
+ ISD::FSIN, ISD::FROUND}, MVT::f16, Custom);
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
@@ -5797,8 +5796,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND:
return lowerFP_ROUND(Op, DAG);
- case ISD::FPTRUNC_ROUND:
- return lowerFPTRUNC_ROUND(Op, DAG);
case ISD::TRAP:
return lowerTRAP(Op, DAG);
case ISD::DEBUGTRAP:
@@ -6648,30 +6645,6 @@ SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
DAG.getTargetConstant(0, DL, MVT::i32));
}
-SDValue SITargetLowering::lowerFPTRUNC_ROUND(SDValue Op,
- SelectionDAG &DAG) const {
- if (Op.getOperand(0)->getValueType(0) != MVT::f32)
- return SDValue();
-
- // Only support towardzero, tonearest, upward and downward.
- int RoundMode = Op.getConstantOperandVal(1);
- if (RoundMode != (int)RoundingMode::TowardZero &&
- RoundMode != (int)RoundingMode::NearestTiesToEven &&
- RoundMode != (int)RoundingMode::TowardPositive &&
- RoundMode != (int)RoundingMode::TowardNegative)
- return SDValue();
-
- // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
- // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
- // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
- // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
- unsigned HW_Mode = (RoundMode + 3) % 4;
- SDLoc DL(Op);
- SDValue RoundFlag = DAG.getTargetConstant(HW_Mode, DL, MVT::i32);
- return DAG.getNode(AMDGPUISD::FPTRUNC_ROUND, DL, Op.getNode()->getVTList(),
- Op->getOperand(0), RoundFlag);
-}
-
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::f16 &&
"Do not know how to custom lower FP_ROUND for non-f16 type");
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index eed4b3e79cdeee..1f198a92c0fa6a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -145,7 +145,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerFPTRUNC_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 2b54429dc9a03f..4f7fbcb7067e2f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -308,7 +308,7 @@ def SDTFPRoundModeOp : SDTypeProfile<1, 2, [
SDTCisFP<0>, SDTCisFP<1>, SDTCisInt<2>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>
]>;
-def SIfptrunc_round : SDNode<"AMDGPUISD::FPTRUNC_ROUND", SDTFPRoundModeOp>;
+def SIfptrunc_round : SDNode<"ISD::FPTRUNC_ROUND", SDTFPRoundModeOp>;
//===----------------------------------------------------------------------===//
// ValueType helpers
@@ -796,6 +796,22 @@ return CurDAG->getTargetConstant(
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
}]>;
+def as_hw_round_mode : SDNodeXForm<timm, [{
+ // "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
+ // "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
+ // "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
+ // "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
+ return CurDAG->getTargetConstant((N->getSExtValue() + 3) % 4, SDLoc(N),
+ MVT::i32);
+}]>;
+
+def SupportedRoundMode : TImmLeaf<i32, [{
+ return Imm == (int)RoundingMode::TowardZero ||
+ Imm == (int)RoundingMode::NearestTiesToEven ||
+ Imm == (int)RoundingMode::TowardPositive ||
+ Imm == (int)RoundingMode::TowardNegative;
+}]>;
+
class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
uint64_t Imm = N->getZExtValue();
unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 814d3182fb5df8..48e79124ac29dc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -229,10 +229,12 @@ def S_INVERSE_BALLOT_U64 : SPseudoInstSI<
// in the ModeRegister pass.
let Uses = [MODE, EXEC] in {
def FPTRUNC_ROUND_F16_F32_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VGPR_32:$src0, i32imm:$round),
- [(set f16:$vdst, (SIfptrunc_round f32:$src0, i32:$round))]>;
+ (ins VGPR_32:$src0, i32imm:$round)>;
} // End Uses = [MODE, EXEC]
+def : GCNPat <(f16 (SIfptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))),
+ (FPTRUNC_ROUND_F16_F32_PSEUDO $src0, (as_hw_round_mode $round))>;
+
// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
let Defs = [SCC], isConvergent = 1 in {
@@ -4055,11 +4057,6 @@ def G_SI_CALL : AMDGPUGenericInstruction {
let isConvergent = 1;
}
-def G_FPTRUNC_ROUND : AMDGPUGenericInstruction {
- let OutOperandList = (outs type0:$vdst);
- let InOperandList = (ins type1:$src0, untyped_imm_0:$round);
- let hasSideEffects = 0;
-}
//============================================================================//
// Dummy Instructions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll
index 4bcd0cf5e6a0e5..f1d5b07e832c48 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.err.ll
@@ -1,9 +1,8 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefixes=SDAG-FAIL
-; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=GISEL-FAIL
+; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL
+; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL
define amdgpu_gs void @test_fptrunc_round_f64(double %a, ptr addrspace(1) %out) {
-; SDAG-FAIL: LLVM ERROR: Cannot select
-; GISEL-FAIL: unable to legalize instruction
+; FAIL: LLVM ERROR: Cannot select
%res = call half @llvm.fptrunc.round.f16.f64(double %a, metadata !"round.upward")
store half %res, ptr addrspace(1) %out, align 4
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
index 71d0ee524bab73..54ed6f1eb42820 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fptrunc.round.ll
@@ -176,8 +176,7 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> %
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT: ; return to shader part epilog
%res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
ret <2 x half> %res
@@ -197,8 +196,7 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float>
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT: ; return to shader part epilog
%res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
ret <2 x half> %res
@@ -228,23 +226,18 @@ define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v2
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v2
; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v3
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT: v_lshl_or_b32 v1, v7, 16, v6
-; GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v3
+; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7
+; GISEL-NEXT: v_pack_b32_f16 v1, v1, v2
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
-; GISEL-NEXT: v_pk_add_f16 v0, v0, v1
-; GISEL-NEXT: v_pk_add_f16 v0, v2, v0
+; GISEL-NEXT: v_pk_add_f16 v0, v0, v3
+; GISEL-NEXT: v_pk_add_f16 v0, v1, v0
; GISEL-NEXT: global_store_dword v[4:5], v0, off
; GISEL-NEXT: s_endpgm
%res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
@@ -295,31 +288,54 @@ define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float>
}
define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> inreg %a, <2 x float> inreg %b, ptr addrspace(1) %out) {
-; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: v_mov_b32_e32 v3, s2
-; CHECK-NEXT: v_mov_b32_e32 v4, s1
-; CHECK-NEXT: v_mov_b32_e32 v5, s3
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
-; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v3
-; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CHECK-NEXT: v_cvt_f16_f32_e32 v7, v5
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; CHECK-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; CHECK-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; CHECK-NEXT: v_lshl_or_b32 v2, v4, 16, v2
-; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v5
-; CHECK-NEXT: v_lshl_or_b32 v5, v7, 16, v6
-; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3
-; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
-; CHECK-NEXT: v_pk_add_f16 v2, v2, v5
-; CHECK-NEXT: v_pk_add_f16 v2, v3, v2
-; CHECK-NEXT: global_store_dword v[0:1], v2, off
-; CHECK-NEXT: s_endpgm
+; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_mov_b32_e32 v2, s0
+; SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SDAG-NEXT: v_mov_b32_e32 v4, s1
+; SDAG-NEXT: v_mov_b32_e32 v5, s3
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v6, v3
+; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4
+; SDAG-NEXT: v_cvt_f16_f32_e32 v7, v5
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SDAG-NEXT: v_lshl_or_b32 v2, v4, 16, v2
+; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v5
+; SDAG-NEXT: v_lshl_or_b32 v5, v7, 16, v6
+; SDAG-NEXT: v_lshl_or_b32 v3, v4, 16, v3
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; SDAG-NEXT: v_pk_add_f16 v2, v2, v5
+; SDAG-NEXT: v_pk_add_f16 v2, v3, v2
+; SDAG-NEXT: global_store_dword v[0:1], v2, off
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GISEL-NEXT: v_mov_b32_e32 v4, s2
+; GISEL-NEXT: v_mov_b32_e32 v5, s3
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v4
+; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GISEL-NEXT: v_pack_b32_f16 v2, v2, v3
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
+; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v4
+; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GISEL-NEXT: v_pack_b32_f16 v5, v6, v7
+; GISEL-NEXT: v_pack_b32_f16 v3, v3, v4
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
+; GISEL-NEXT: v_pk_add_f16 v2, v2, v5
+; GISEL-NEXT: v_pk_add_f16 v2, v3, v2
+; GISEL-NEXT: global_store_dword v[0:1], v2, off
+; GISEL-NEXT: s_endpgm
%res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
%res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
%res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
@@ -344,8 +360,7 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
; GISEL-NEXT: ; return to shader part epilog
%res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward")
@@ -367,8 +382,7 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float>
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
; GISEL-NEXT: ; return to shader part epilog
%res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward")
@@ -391,13 +405,11 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3
; GISEL-NEXT: ; return to shader part epilog
%res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.upward")
ret <4 x half> %res
@@ -419,13 +431,11 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float>
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3
; GISEL-NEXT: ; return to shader part epilog
%res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.downward")
ret <4 x half> %res
@@ -453,21 +463,17 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> %
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4
-; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3
+; GISEL-NEXT: v_pack_b32_f16 v2, v4, v5
+; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7
; GISEL-NEXT: ; return to shader part epilog
%res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.upward")
ret <8 x half> %res
@@ -495,21 +501,17 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float>
; GISEL: ; %bb.0:
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
-; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4
-; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6
+; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3
+; GISEL-NEXT: v_pack_b32_f16 v2, v4, v5
+; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7
; GISEL-NEXT: ; return to shader part epilog
%res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward")
ret <8 x half> %res
More information about the llvm-commits
mailing list