[llvm] a7786ba - AMDGPU: Move zeroed FP high bits optimization to patterns
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 22 09:48:03 PDT 2021
Author: Matt Arsenault
Date: 2021-06-22T12:47:56-04:00
New Revision: a7786badb75b8c7cd425fefaeefc0a99fe8b49d8
URL: https://github.com/llvm/llvm-project/commit/a7786badb75b8c7cd425fefaeefc0a99fe8b49d8
DIFF: https://github.com/llvm/llvm-project/commit/a7786badb75b8c7cd425fefaeefc0a99fe8b49d8.diff
LOG: AMDGPU: Move zeroed FP high bits optimization to patterns
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.td
llvm/lib/Target/AMDGPU/SIInstructions.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 1ebda1b526f3..3eae3171f111 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -44,6 +44,63 @@ class R600InstrInfo;
namespace {
+// Instructions that will be lowered with a final instruction that zeros the
+// high result bits.
+// XXX - only need to list legal operations.
+static bool fp16SrcZerosHighBits(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FMA:
+ case ISD::FMAD:
+ case ISD::FCANONICALIZE:
+ case ISD::FP_ROUND:
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::FABS:
+ // Fabs is lowered to a bit operation, but it's an and which will clear the
+ // high bits anyway.
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::CLAMP:
+ case AMDGPUISD::COS_HW:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMED3:
+ case AMDGPUISD::FMAD_FTZ:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_IFLAG:
+ case AMDGPUISD::LDEXP:
+ return true;
+ default:
+ // fcopysign, select and others may be lowered to 32-bit bit operations
+ // which don't zero the high bits.
+ return false;
+ }
+}
+
static bool isNullConstantOrUndef(SDValue V) {
if (V.isUndef())
return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index bf56ab4c0aa3..cadcf40f759b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4353,7 +4353,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_PK_I16_I32)
NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
- NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@@ -4483,8 +4482,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
break;
}
- case AMDGPUISD::FP_TO_FP16:
- case AMDGPUISD::FP16_ZEXT: {
+ case AMDGPUISD::FP_TO_FP16: {
unsigned BitWidth = Known.getBitWidth();
// High bits are zero.
@@ -4631,7 +4629,6 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
case AMDGPUISD::BUFFER_LOAD_USHORT:
return 16;
case AMDGPUISD::FP_TO_FP16:
- case AMDGPUISD::FP16_ZEXT:
return 16;
default:
return 1;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index b21a1e7498fb..dba01af02d06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -457,9 +457,6 @@ enum NodeType : unsigned {
// are known 0.
FP_TO_FP16,
- // Wrapper around fp16 results that are known to zero the high bits.
- FP16_ZEXT,
-
/// This node is for VLIW targets and it is used to represent a vector
/// that is stored in consecutive registers with the same channel.
/// For example:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index d63bd2e9eb2e..0f9cb712f820 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -132,7 +132,6 @@ def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFP
def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
-def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f477d79239e8..13e289c0a55d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9375,63 +9375,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return SDValue();
}
-// Instructions that will be lowered with a final instruction that zeros the
-// high result bits.
-// XXX - probably only need to list legal operations.
-static bool fp16SrcZerosHighBits(unsigned Opc) {
- switch (Opc) {
- case ISD::FADD:
- case ISD::FSUB:
- case ISD::FMUL:
- case ISD::FDIV:
- case ISD::FREM:
- case ISD::FMA:
- case ISD::FMAD:
- case ISD::FCANONICALIZE:
- case ISD::FP_ROUND:
- case ISD::UINT_TO_FP:
- case ISD::SINT_TO_FP:
- case ISD::FABS:
- // Fabs is lowered to a bit operation, but it's an and which will clear the
- // high bits anyway.
- case ISD::FSQRT:
- case ISD::FSIN:
- case ISD::FCOS:
- case ISD::FPOWI:
- case ISD::FPOW:
- case ISD::FLOG:
- case ISD::FLOG2:
- case ISD::FLOG10:
- case ISD::FEXP:
- case ISD::FEXP2:
- case ISD::FCEIL:
- case ISD::FTRUNC:
- case ISD::FRINT:
- case ISD::FNEARBYINT:
- case ISD::FROUND:
- case ISD::FFLOOR:
- case ISD::FMINNUM:
- case ISD::FMAXNUM:
- case AMDGPUISD::FRACT:
- case AMDGPUISD::CLAMP:
- case AMDGPUISD::COS_HW:
- case AMDGPUISD::SIN_HW:
- case AMDGPUISD::FMIN3:
- case AMDGPUISD::FMAX3:
- case AMDGPUISD::FMED3:
- case AMDGPUISD::FMAD_FTZ:
- case AMDGPUISD::RCP:
- case AMDGPUISD::RSQ:
- case AMDGPUISD::RCP_IFLAG:
- case AMDGPUISD::LDEXP:
- return true;
- default:
- // fcopysign, select and others may be lowered to 32-bit bit operations
- // which don't zero the high bits.
- return false;
- }
-}
-
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (!Subtarget->has16BitInsts() ||
@@ -9446,15 +9389,6 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
if (Src.getValueType() != MVT::i16)
return SDValue();
- // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
- // FIXME: It is not universally true that the high bits are zeroed on gfx9.
- if (Src.getOpcode() == ISD::BITCAST) {
- SDValue BCSrc = Src.getOperand(0);
- if (BCSrc.getValueType() == MVT::f16 &&
- fp16SrcZerosHighBits(BCSrc.getOpcode()))
- return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
- }
-
return SDValue();
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 92ff9946e811..25b647d34ec1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -814,6 +814,12 @@ def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
(isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
}], getNegV2I16Imm>;
+
+def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
+ return fp16SrcZerosHighBits(N->getOpcode());
+}]>;
+
+
//===----------------------------------------------------------------------===//
// MUBUF/SMEM Patterns
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index adf168e632c5..8e24ce00d163 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1992,11 +1992,13 @@ def : GCNPat <
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
-def : GCNPat <
- (i32 (AMDGPUfp16_zext f16:$src)),
- (COPY $src)
->;
+// Eliminate a zero extension from an fp16 operation if it already
+// zeros the high bits of the 32-bit register.
+def : GCNPat<
+ (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
+ (COPY VSrc_b16:$src)
+>;
def : GCNPat <
(i32 (trunc i64:$a)),
More information about the llvm-commits mailing list