[llvm] 9ad8a1f - AMDGPU: Fix high 16-bit optimization on gfx9
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 22 10:16:51 PDT 2021
Author: Matt Arsenault
Date: 2021-06-22T13:16:45-04:00
New Revision: 9ad8a1f6fb2aea775736cd59129b7299be443c5c
URL: https://github.com/llvm/llvm-project/commit/9ad8a1f6fb2aea775736cd59129b7299be443c5c
DIFF: https://github.com/llvm/llvm-project/commit/9ad8a1f6fb2aea775736cd59129b7299be443c5c.diff
LOG: AMDGPU: Fix high 16-bit optimization on gfx9
We can do this optimization in the majority of cases, but we currently
don't have a way to do it. We do not track/model which instructions
have which behavior, the control bit to change the high bit behavior,
or make use of preserved bits at all. This is a bit fuzzy since we
don't know precisely how the source instruction will be lowered, but
that only really matters in one case (for fma_mixlo).
We do need to fix up some of these cases after selection, but the
pattern helps eliminate many of these zexts.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/SIInstructions.td
llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 3eae3171f111..a3106ded1e38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -44,63 +44,6 @@ class R600InstrInfo;
namespace {
-// Instructions that will be lowered with a final instruction that zeros the
-// high result bits.
-// XXX - only need to list legal operations.
-static bool fp16SrcZerosHighBits(unsigned Opc) {
- switch (Opc) {
- case ISD::FADD:
- case ISD::FSUB:
- case ISD::FMUL:
- case ISD::FDIV:
- case ISD::FREM:
- case ISD::FMA:
- case ISD::FMAD:
- case ISD::FCANONICALIZE:
- case ISD::FP_ROUND:
- case ISD::UINT_TO_FP:
- case ISD::SINT_TO_FP:
- case ISD::FABS:
- // Fabs is lowered to a bit operation, but it's an and which will clear the
- // high bits anyway.
- case ISD::FSQRT:
- case ISD::FSIN:
- case ISD::FCOS:
- case ISD::FPOWI:
- case ISD::FPOW:
- case ISD::FLOG:
- case ISD::FLOG2:
- case ISD::FLOG10:
- case ISD::FEXP:
- case ISD::FEXP2:
- case ISD::FCEIL:
- case ISD::FTRUNC:
- case ISD::FRINT:
- case ISD::FNEARBYINT:
- case ISD::FROUND:
- case ISD::FFLOOR:
- case ISD::FMINNUM:
- case ISD::FMAXNUM:
- case AMDGPUISD::FRACT:
- case AMDGPUISD::CLAMP:
- case AMDGPUISD::COS_HW:
- case AMDGPUISD::SIN_HW:
- case AMDGPUISD::FMIN3:
- case AMDGPUISD::FMAX3:
- case AMDGPUISD::FMED3:
- case AMDGPUISD::FMAD_FTZ:
- case AMDGPUISD::RCP:
- case AMDGPUISD::RSQ:
- case AMDGPUISD::RCP_IFLAG:
- case AMDGPUISD::LDEXP:
- return true;
- default:
- // fcopysign, select and others may be lowered to 32-bit bit operations
- // which don't zero the high bits.
- return false;
- }
-}
-
static bool isNullConstantOrUndef(SDValue V) {
if (V.isUndef())
return true;
@@ -164,6 +107,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool EnableLateStructurizeCFG;
+ // Instructions that will be lowered with a final instruction that zeros the
+ // high result bits.
+ bool fp16SrcZerosHighBits(unsigned Opc) const;
+
public:
explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
@@ -459,6 +406,68 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return SelectionDAGISel::runOnMachineFunction(MF);
}
+bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
+ // XXX - only need to list legal operations.
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FCANONICALIZE:
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::FABS:
+ // Fabs is lowered to a bit operation, but it's an and which will clear the
+ // high bits anyway.
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::CLAMP:
+ case AMDGPUISD::COS_HW:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMED3:
+ case AMDGPUISD::FMAD_FTZ:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_IFLAG:
+ case AMDGPUISD::LDEXP:
+ // On gfx10, all 16-bit instructions preserve the high bits.
+ return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
+ case ISD::FP_ROUND:
+ // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
+ // high bits on gfx9.
+ // TODO: If we had the source node we could see if the source was fma/mad
+ return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ case ISD::FMA:
+ case ISD::FMAD:
+ case AMDGPUISD::DIV_FIXUP:
+ return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ default:
+ // fcopysign, select and others may be lowered to 32-bit bit operations
+ // which don't zero the high bits.
+ return false;
+ }
+}
+
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
assert(Subtarget->d16PreservesUnusedBits());
MVT VT = N->getValueType(0).getSimpleVT();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8e24ce00d163..9778a6f79f39 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1995,10 +1995,16 @@ def : GCNPat <
// Eliminate a zero extension from an fp16 operation if it already
// zeros the high bits of the 32-bit register.
+//
+// This is complicated on gfx9+. Some instructions maintain the legacy
+// zeroing behavior, but others preserve the high bits. Some have a
+// control bit to change the behavior. We can't simply say with
+// certainty what the source behavior is without more context on how
+// the src is lowered. e.g. fptrunc + fma may be lowered to a
+// v_fma_mix* instruction which does not zero, or may not.
def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
- (COPY VSrc_b16:$src)
->;
+ (COPY VSrc_b16:$src)>;
def : GCNPat <
(i32 (trunc i64:$a)),
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index be53ffe52918..abdfd2c9c677 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -140,7 +140,8 @@ entry:
; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
-; GCN-NOT: v[[R_F16]]
+; SIVI-NOT: v[[R_F16]]
+; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]]
; GCN: buffer_store_dword v[[R_F16]]
define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
i32 addrspace(1)* %r,
@@ -157,7 +158,8 @@ entry:
; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32:
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
-; GCN-NOT: v[[R_F16]]
+; SIVI-NOT: v[[R_F16]]
+; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]]
; GCN: buffer_store_dword v[[R_F16]]
define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
i32 addrspace(1)* %r,
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
index 6fc0ce9575d4..ed2202c51028 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX906 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; GCN-LABEL: {{^}}shl_i16:
@@ -188,3 +190,93 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) {
%zext = zext i16 %res to i32
ret i32 %zext
}
+
+; GCN-LABEL: {{^}}zext_fadd_f16:
+; GFX8: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX9-NEXT: s_setpc_b64
+
+; GFX10: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, [[ADD]]
+define i32 @zext_fadd_f16(half %x, half %y) {
+ %add = fadd half %x, %y
+ %cast = bitcast half %add to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_fma_f16:
+; GFX8: v_fma_f16 [[FMA:v[0-9]+]], v0, v1, v2
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_fma_f16 [[FMA:v[0-9]+]], v0, v1, v2
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, [[FMA]]
+
+; GFX10: v_fmac_f16_e32 [[FMA:v[0-9]+]], v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, [[FMA]]
+define i32 @zext_fma_f16(half %x, half %y, half %z) {
+ %fma = call half @llvm.fma.f16(half %x, half %y, half %z)
+ %cast = bitcast half %fma to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_div_fixup_f16:
+; GFX8: v_div_fixup_f16 v0, v0, v1, v2
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_div_fixup_f16 v0, v0, v1, v2
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_div_fixup_f16 v0, v0, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_div_fixup_f16(half %x, half %y, half %z) {
+ %div.fixup = call half @llvm.amdgcn.div.fixup.f16(half %x, half %y, half %z)
+ %cast = bitcast half %div.fixup to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; We technically could eliminate the and on gfx9 here but we don't try
+; to inspect the source of the fptrunc. We're only worried about cases
+; that lower to v_fma_mix* instructions.
+
+; GCN-LABEL: {{^}}zext_fptrunc_f16:
+; GFX8: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_fptrunc_f16(float %x) {
+ %fptrunc = fptrunc float %x to half
+ %cast = bitcast half %fptrunc to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_fptrunc_fma_f16:
+; GFX900: v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX906: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) {
+ %fma = call float @llvm.fma.f32(float %x, float %y, float %z)
+ %fptrunc = fptrunc float %fma to half
+ %cast = bitcast half %fptrunc to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+declare half @llvm.amdgcn.div.fixup.f16(half, half, half)
+declare half @llvm.fma.f16(half, half, half)
+declare float @llvm.fma.f32(float, float, float)
More information about the llvm-commits
mailing list