[llvm] e6605a2 - DAG: Fix wrong legality check for ISD::FMAD

Mon Apr 13 10:26:02 PDT 2020

Author: Matt Arsenault
Date: 2020-04-13T10:25:39-07:00
New Revision: e6605a209cc7785ae635fee1df7ff4c136d35c17

URL: https://github.com/llvm/llvm-project/commit/e6605a209cc7785ae635fee1df7ff4c136d35c17
DIFF: https://github.com/llvm/llvm-project/commit/e6605a209cc7785ae635fee1df7ff4c136d35c17.diff

LOG: DAG: Fix wrong legality check for ISD::FMAD

Since 1725f2884175ca618d29b06e35f5c6ebd618053d, this should check
isFMADLegalForFAddFSub rather than the plain isOperationLegal.

This would assert in a subset of cases due to an oddity in how FMAD is
selected. We will allow FMA formation pre-legalize, but not FMAD even
in cases where it would be valid.

The current hook requires passing in the root fadd/fsub. However, in
this distributed case, it would be far more complicated to pass in
the relevant operand. AMDGPU doesn't get any value from the node, and
only needs the type and is the only implementor, so I'm not sure why
we have this complexity. Just rename and expand the assert to avoid
the more complicated checks spread through the distribution logic.

Added: 
    llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.h

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index df8799fd72aa..4eb8ba73bcab 100644

--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2642,11 +2642,13 @@ class TargetLoweringBase {
     return false;
   }
 
-  /// Returns true if the FADD or FSUB node passed could legally be combined with
-  /// an fmul to form an ISD::FMAD.
-  virtual bool isFMADLegalForFAddFSub(const SelectionDAG &DAG,
-                                      const SDNode *N) const {
-    assert(N->getOpcode() == ISD::FADD || N->getOpcode() == ISD::FSUB);
+  /// Returns true if \p N can be combined to form an ISD::FMAD. \p N may be an
+  /// ISD::FADD, ISD::FSUB, or an ISD::FMUL which will be distributed into an
+  /// fadd/fsub.
+  virtual bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const {
+    assert((N->getOpcode() == ISD::FADD || N->getOpcode() == ISD::FSUB ||
+            N->getOpcode() == ISD::FMUL) &&
+           "unexpected node in FMAD forming combine");
     return isOperationLegal(ISD::FMAD, N->getValueType(0));
   }
 

diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 60701af11ad1..c61c795988a3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11728,7 +11728,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   const TargetOptions &Options = DAG.getTarget().Options;
 
   // Floating-point multiply-add with intermediate rounding.
-  bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N));
+  bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
 
   // Floating-point multiply-add without intermediate rounding.
   bool HasFMA =
@@ -11945,7 +11945,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
 
   const TargetOptions &Options = DAG.getTarget().Options;
   // Floating-point multiply-add with intermediate rounding.
-  bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N));
+  bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
 
   // Floating-point multiply-add without intermediate rounding.
   bool HasFMA =
@@ -12289,7 +12289,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   // Floating-point multiply-add with intermediate rounding. This can result
   // in a less precise result due to the changed rounding order.
   bool HasFMAD = Options.UnsafeFPMath &&
-                 (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+                 (LegalOperations && TLI.isFMADLegal(DAG, N));
 
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 44306cf9de39..196e361b2bc4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3946,8 +3946,8 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
   return false;
 }
 
-bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG,
-                                              const SDNode *N) const {
+bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
+                                   const SDNode *N) const {
   // TODO: Check future ftz flag
   // v_mad_f32/v_mac_f32 do not support denormals.
   EVT VT = N->getValueType(0);

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1e02fcdcedbf..226003423889 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -358,8 +358,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                   EVT VT) const override;
-  bool isFMADLegalForFAddFSub(const SelectionDAG &DAG,
-                              const SDNode *N) const override;
+  bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override;
 
   SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;

diff  --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
new file mode 100644
index 000000000000..c1e9f6e74af0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,FMA %s
+; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,NOFUSE %s
+; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,NOFUSE %s
+; RUN: llc -march=amdgcn -mcpu=tonga -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,NOFUSE %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN,FMA %s
+
+; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=tonga -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,FMAD %s
+
+; Check for incorrect fmad formation when distributing
+
+define float @unsafe_fmul_fadd_distribute_fast_f32(float %arg0, float %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, v1, v0, v0
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mac_f32_e32 v0, v1, v0
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd fast float %arg1, 1.0
+  %tmp1 = fmul fast float %arg0, %add
+  ret float %tmp1
+}
+
+define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, -v1, v0, v0
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_sub_f32_e32 v1, 1.0, v1
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, -v1, v0, v0
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fsub fast float 1.0, %arg1
+  %tmp1 = fmul fast float %arg0, %add
+  ret float %tmp1
+}
+
+define <2 x float> @unsafe_fmul_fadd_distribute_fast_v2f32(<2 x float> %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, v2, v0, v0
+; FMA-NEXT:    v_fma_f32 v1, v3, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; NOFUSE-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mac_f32_e32 v0, v2, v0
+; FMAD-NEXT:    v_mac_f32_e32 v1, v3, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd fast <2 x float> %arg1, <float 1.0, float 1.0>
+  %tmp1 = fmul fast <2 x float> %arg0, %add
+  ret <2 x float> %tmp1
+}
+
+define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, -v2, v0, v0
+; FMA-NEXT:    v_fma_f32 v1, -v3, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_sub_f32_e32 v3, 1.0, v3
+; NOFUSE-NEXT:    v_sub_f32_e32 v2, 1.0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; NOFUSE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, -v2, v0, v0
+; FMAD-NEXT:    v_mad_f32 v1, -v3, v1, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fsub  fast <2 x float> <float 1.0, float 1.0>, %arg1
+  %tmp1 = fmul fast <2 x float> %arg0, %add
+  ret <2 x float> %tmp1
+}
+
+define <2 x float> @unsafe_fast_fmul_fadd_distribute_post_legalize_f32(float %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, v0, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, v0, v1, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %add = fadd fast float %arg0, 1.0
+  %splat = insertelement <2 x float> undef, float %add, i32 0
+  %tmp1 = fmul fast <2 x float> %arg1, %splat
+  ret <2 x float> %tmp1
+}
+
+define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, <2 x float> %arg1) #0 {
+; FMA-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
+; FMA:       ; %bb.0:
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMA-NEXT:    v_fma_f32 v0, -v0, v1, v1
+; FMA-NEXT:    s_setpc_b64 s[30:31]
+;
+; NOFUSE-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
+; NOFUSE:       ; %bb.0:
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOFUSE-NEXT:    v_sub_f32_e32 v0, 1.0, v0
+; NOFUSE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
+;
+; FMAD-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
+; FMAD:       ; %bb.0:
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FMAD-NEXT:    v_mad_f32 v0, -v0, v1, v1
+; FMAD-NEXT:    s_setpc_b64 s[30:31]
+  %sub = fsub fast float 1.0, %arg0
+  %splat = insertelement <2 x float> undef, float %sub, i32 0
+  %tmp1 = fmul fast <2 x float> %arg1, %splat
+  ret <2 x float> %tmp1
+}
+
+attributes #0 = { "no-infs-fp-math"="true" "unsafe-fp-math"="true" }