[llvm] 7638710 - [DAGCombiner] Require nsz for aggressive fma fold

Qiu Chaofan via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 22 08:14:24 PDT 2020


Author: Qiu Chaofan
Date: 2020-03-22T23:10:07+08:00
New Revision: 763871053cc0c209e14f07f7f74bb8f1c16872d9

URL: https://github.com/llvm/llvm-project/commit/763871053cc0c209e14f07f7f74bb8f1c16872d9
DIFF: https://github.com/llvm/llvm-project/commit/763871053cc0c209e14f07f7f74bb8f1c16872d9.diff

LOG: [DAGCombiner] Require nsz for aggressive fma fold

For the fold `x - (fma y, z, u*v) -> (fma -y, z, (fma -u, v, x))`: if `y*z`
is 1, `u*v` is -1, and `x` is -0, the sign of the result changes. The
original expression evaluates to -0 - (1 + -1) = -0, while the folded form
gives -1 + (1 + -0) = +0, so the fold is only valid under no-signed-zeros
(nsz).
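
To make the sign flip concrete, here is a minimal standalone C++ sketch
(illustration only, not part of this patch) that evaluates both sides of the
fold with those values via std::fma:

#include <cmath>
#include <cstdio>

int main() {
  double x = -0.0, y = 1.0, z = 1.0, u = -1.0, v = 1.0;
  // Unfolded: x - fma(y, z, u*v) = -0 - (1 + -1) = -0
  double before = x - std::fma(y, z, u * v);
  // Folded: fma(-y, z, fma(-u, v, x)) = -1 + (1 + -0) = +0
  double after = std::fma(-y, z, std::fma(-u, v, x));
  // signbit distinguishes -0 from +0; this prints "1 0"
  std::printf("%d %d\n", std::signbit(before), std::signbit(after));
  return 0;
}

Under nsz, -0 and +0 are interchangeable, which is why the combine now
requires that flag.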

Differential Revision: https://reviews.llvm.org/D76419

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AMDGPU/fma-combine.ll
    llvm/test/CodeGen/AMDGPU/mad-combine.ll
    llvm/test/CodeGen/PowerPC/fma-assoc.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a6537a0bf97e..4129f2a6798a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11957,6 +11957,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   // Always prefer FMAD to FMA for precision.
   unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+  bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros();
 
   // Is the node an FMUL and contractable either due to global flags or
   // SDNodeFlags.
@@ -12120,7 +12121,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
     //   -> (fma (fneg y), z, (fma (fneg u), v, x))
     if (CanFuse && N1.getOpcode() == PreferredFusedOpcode &&
         isContractableFMUL(N1.getOperand(2)) &&
-        N1->hasOneUse()) {
+        N1->hasOneUse() && NoSignedZero) {
       SDValue N20 = N1.getOperand(2).getOperand(0);
       SDValue N21 = N1.getOperand(2).getOperand(1);
       return DAG.getNode(PreferredFusedOpcode, SL, VT,

diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index a962a3b4ed06..5d78ddada14d 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -2,6 +2,8 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
 
+; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
+
 ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
 ; beneficial even without fp32 denormals, but they do require no-infs-fp-math
 ; for correctness.
@@ -376,9 +378,10 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace
   %u = load volatile double, double addrspace(1)* %gep.3
   %v = load volatile double, double addrspace(1)* %gep.4
 
-  %tmp0 = fmul double %u, %v
-  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
-  %tmp2 = fsub double %x, %tmp1
+  ; nsz flag is needed since this combine may change sign of zero
+  %tmp0 = fmul nsz double %u, %v
+  %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
+  %tmp2 = fsub nsz double %x, %tmp1
 
   store double %tmp2, double addrspace(1)* %gep.out
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
index 0b360f6ecefb..09bc371876fc 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s
 
+; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
+
 ; Make sure we don't form mad with denormals
 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
@@ -566,9 +568,10 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(
   %u = load volatile float, float addrspace(1)* %gep.3
   %v = load volatile float, float addrspace(1)* %gep.4
 
-  %tmp0 = fmul float %u, %v
-  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
-  %tmp2 = fsub float %x, %tmp1
+  ; nsz flag is needed since this combine may change sign of zero
+  %tmp0 = fmul nsz float %u, %v
+  %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
+  %tmp2 = fsub nsz float %x, %tmp1
 
   store float %tmp2, float addrspace(1)* %gep.out
   ret void

diff --git a/llvm/test/CodeGen/PowerPC/fma-assoc.ll b/llvm/test/CodeGen/PowerPC/fma-assoc.ll
index e43967a59ff3..a89972918862 100644
--- a/llvm/test/CodeGen/PowerPC/fma-assoc.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-assoc.ll
@@ -331,15 +331,16 @@ define double @test_reassoc_FMSUB_ASSOC1(double %A, double %B, double %C,
 define double @test_reassoc_FMSUB_ASSOC2(double %A, double %B, double %C,
 ; CHECK-LABEL: test_reassoc_FMSUB_ASSOC2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fnmsub 0, 3, 4, 5
-; CHECK-NEXT:    fnmsub 1, 1, 2, 0
+; CHECK-NEXT:    fmul 0, 3, 4
+; CHECK-NEXT:    fmadd 0, 1, 2, 0
+; CHECK-NEXT:    fsub 1, 5, 0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC2:
 ; CHECK-VSX:       # %bb.0:
-; CHECK-VSX-NEXT:    xsnmsubmdp 3, 4, 5
-; CHECK-VSX-NEXT:    xsnmsubadp 3, 1, 2
-; CHECK-VSX-NEXT:    fmr 1, 3
+; CHECK-VSX-NEXT:    xsmuldp 0, 3, 4
+; CHECK-VSX-NEXT:    xsmaddadp 0, 1, 2
+; CHECK-VSX-NEXT:    xssubdp 1, 5, 0
 ; CHECK-VSX-NEXT:    blr
                                  double %D, double %E) {
   %F = fmul reassoc double %A, %B         ; <double> [#uses=1]
