[llvm] 9ad8a1f - AMDGPU: Fix high 16-bit optimization on gfx9

Tue Jun 22 10:16:51 PDT 2021

Author: Matt Arsenault
Date: 2021-06-22T13:16:45-04:00
New Revision: 9ad8a1f6fb2aea775736cd59129b7299be443c5c

URL: https://github.com/llvm/llvm-project/commit/9ad8a1f6fb2aea775736cd59129b7299be443c5c
DIFF: https://github.com/llvm/llvm-project/commit/9ad8a1f6fb2aea775736cd59129b7299be443c5c.diff

LOG: AMDGPU: Fix high 16-bit optimization on gfx9

We can do this optimization in the majority of cases, but we currently
don't have a way to do it. We do not track/model which instructions
have which behavior or the control bit that changes the high-bit
behavior, nor do we make use of preserved bits at all. This is a bit
fuzzy since we don't know precisely how the source instruction will be
lowered, but that only really matters in one case (for fma_mixlo).

We do need to fix up some of these cases after selection, but the
pattern helps eliminate many of these zexts.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/SIInstructions.td
    llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
    llvm/test/CodeGen/AMDGPU/preserve-hi16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 3eae3171f111..a3106ded1e38 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -44,63 +44,6 @@ class R600InstrInfo;
 
 namespace {
 
-// Instructions that will be lowered with a final instruction that zeros the
-// high result bits.
-// XXX - only need to list legal operations.
-static bool fp16SrcZerosHighBits(unsigned Opc) {
-  switch (Opc) {
-  case ISD::FADD:
-  case ISD::FSUB:
-  case ISD::FMUL:
-  case ISD::FDIV:
-  case ISD::FREM:
-  case ISD::FMA:
-  case ISD::FMAD:
-  case ISD::FCANONICALIZE:
-  case ISD::FP_ROUND:
-  case ISD::UINT_TO_FP:
-  case ISD::SINT_TO_FP:
-  case ISD::FABS:
-    // Fabs is lowered to a bit operation, but it's an and which will clear the
-    // high bits anyway.
-  case ISD::FSQRT:
-  case ISD::FSIN:
-  case ISD::FCOS:
-  case ISD::FPOWI:
-  case ISD::FPOW:
-  case ISD::FLOG:
-  case ISD::FLOG2:
-  case ISD::FLOG10:
-  case ISD::FEXP:
-  case ISD::FEXP2:
-  case ISD::FCEIL:
-  case ISD::FTRUNC:
-  case ISD::FRINT:
-  case ISD::FNEARBYINT:
-  case ISD::FROUND:
-  case ISD::FFLOOR:
-  case ISD::FMINNUM:
-  case ISD::FMAXNUM:
-  case AMDGPUISD::FRACT:
-  case AMDGPUISD::CLAMP:
-  case AMDGPUISD::COS_HW:
-  case AMDGPUISD::SIN_HW:
-  case AMDGPUISD::FMIN3:
-  case AMDGPUISD::FMAX3:
-  case AMDGPUISD::FMED3:
-  case AMDGPUISD::FMAD_FTZ:
-  case AMDGPUISD::RCP:
-  case AMDGPUISD::RSQ:
-  case AMDGPUISD::RCP_IFLAG:
-  case AMDGPUISD::LDEXP:
-    return true;
-  default:
-    // fcopysign, select and others may be lowered to 32-bit bit operations
-    // which don't zero the high bits.
-    return false;
-  }
-}
-
 static bool isNullConstantOrUndef(SDValue V) {
   if (V.isUndef())
     return true;
@@ -164,6 +107,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
 
   bool EnableLateStructurizeCFG;
 
+  // Instructions that will be lowered with a final instruction that zeros the
+  // high result bits.
+  bool fp16SrcZerosHighBits(unsigned Opc) const;
+
 public:
   explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
                               CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
@@ -459,6 +406,68 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   return SelectionDAGISel::runOnMachineFunction(MF);
 }
 
+bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
+  // XXX - only need to list legal operations.
+  switch (Opc) {
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::FCANONICALIZE:
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+  case ISD::FABS:
+    // Fabs is lowered to a bit operation, but it's an and which will clear the
+    // high bits anyway.
+  case ISD::FSQRT:
+  case ISD::FSIN:
+  case ISD::FCOS:
+  case ISD::FPOWI:
+  case ISD::FPOW:
+  case ISD::FLOG:
+  case ISD::FLOG2:
+  case ISD::FLOG10:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FCEIL:
+  case ISD::FTRUNC:
+  case ISD::FRINT:
+  case ISD::FNEARBYINT:
+  case ISD::FROUND:
+  case ISD::FFLOOR:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case AMDGPUISD::FRACT:
+  case AMDGPUISD::CLAMP:
+  case AMDGPUISD::COS_HW:
+  case AMDGPUISD::SIN_HW:
+  case AMDGPUISD::FMIN3:
+  case AMDGPUISD::FMAX3:
+  case AMDGPUISD::FMED3:
+  case AMDGPUISD::FMAD_FTZ:
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RSQ:
+  case AMDGPUISD::RCP_IFLAG:
+  case AMDGPUISD::LDEXP:
+    // On gfx10, all 16-bit instructions preserve the high bits.
+    return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
+  case ISD::FP_ROUND:
+    // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
+    // high bits on gfx9.
+    // TODO: If we had the source node we could see if the source was fma/mad
+    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+  case ISD::FMA:
+  case ISD::FMAD:
+  case AMDGPUISD::DIV_FIXUP:
+    return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+  default:
+    // fcopysign, select and others may be lowered to 32-bit bit operations
+    // which don't zero the high bits.
+    return false;
+  }
+}
+
 bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
   assert(Subtarget->d16PreservesUnusedBits());
   MVT VT = N->getValueType(0).getSimpleVT();

diff  --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8e24ce00d163..9778a6f79f39 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1995,10 +1995,16 @@ def : GCNPat <
 
 // Eliminate a zero extension from an fp16 operation if it already
 // zeros the high bits of the 32-bit register.
+//
+// This is complicated on gfx9+. Some instructions maintain the legacy
+// zeroing behavior, but others preserve the high bits. Some have a
+// control bit to change the behavior. We can't simply say with
+// certainty what the source behavior is without more context on how
+// the src is lowered. e.g. fptrunc + fma may be lowered to a
+// v_fma_mix* instruction which does not zero, or may not.
 def : GCNPat<
   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
-  (COPY VSrc_b16:$src)
->;
+  (COPY VSrc_b16:$src)>;
 
 def : GCNPat <
   (i32 (trunc i64:$a)),

diff  --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index be53ffe52918..abdfd2c9c677 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -140,7 +140,8 @@ entry:
 ; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32:
 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
-; GCN-NOT: v[[R_F16]]
+; SIVI-NOT: v[[R_F16]]
+; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]]
 ; GCN: buffer_store_dword v[[R_F16]]
 define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
     i32 addrspace(1)* %r,
@@ -157,7 +158,8 @@ entry:
 ; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32:
 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
 ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
-; GCN-NOT: v[[R_F16]]
+; SIVI-NOT: v[[R_F16]]
+; GFX9-NEXT: v_and_b32_e32 v[[R_F16]], 0xffff, v[[R_F16]]
 ; GCN: buffer_store_dword v[[R_F16]]
 define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
     i32 addrspace(1)* %r,

diff  --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
index 6fc0ce9575d4..ed2202c51028 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX906 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 
 ; GCN-LABEL: {{^}}shl_i16:
@@ -188,3 +190,93 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) {
   %zext = zext i16 %res to i32
   ret i32 %zext
 }
+
+; GCN-LABEL: {{^}}zext_fadd_f16:
+; GFX8: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX9-NEXT: s_setpc_b64
+
+; GFX10: v_add_f16_e32 [[ADD:v[0-9]+]], v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, [[ADD]]
+define i32 @zext_fadd_f16(half %x, half %y) {
+  %add = fadd half %x, %y
+  %cast = bitcast half %add to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_fma_f16:
+; GFX8: v_fma_f16 [[FMA:v[0-9]+]], v0, v1, v2
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_fma_f16 [[FMA:v[0-9]+]], v0, v1, v2
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, [[FMA]]
+
+; GFX10: v_fmac_f16_e32 [[FMA:v[0-9]+]], v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, [[FMA]]
+define i32 @zext_fma_f16(half %x, half %y, half %z) {
+  %fma = call half @llvm.fma.f16(half %x, half %y, half %z)
+  %cast = bitcast half %fma to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_div_fixup_f16:
+; GFX8: v_div_fixup_f16 v0, v0, v1, v2
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_div_fixup_f16 v0, v0, v1, v2
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_div_fixup_f16 v0, v0, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_div_fixup_f16(half %x, half %y, half %z) {
+  %div.fixup = call half @llvm.amdgcn.div.fixup.f16(half %x, half %y, half %z)
+  %cast = bitcast half %div.fixup to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+; We technically could eliminate the and on gfx9 here but we don't try
+; to inspect the source of the fptrunc. We're only worried about cases
+; that lower to v_fma_mix* instructions.
+
+; GCN-LABEL: {{^}}zext_fptrunc_f16:
+; GFX8: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: s_setpc_b64
+
+; GFX9: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_fptrunc_f16(float %x) {
+  %fptrunc = fptrunc float %x to half
+  %cast = bitcast half %fptrunc to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}zext_fptrunc_fma_f16:
+; GFX900: v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX906: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
+
+; GFX10: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) {
+  %fma = call float @llvm.fma.f32(float %x, float %y, float %z)
+  %fptrunc = fptrunc float %fma to half
+  %cast = bitcast half %fptrunc to i16
+  %zext = zext i16 %cast to i32
+  ret i32 %zext
+}
+
+declare half @llvm.amdgcn.div.fixup.f16(half, half, half)
+declare half @llvm.fma.f16(half, half, half)
+declare float @llvm.fma.f32(float, float, float)