[llvm] 6dbd458 - AMDGPU: Remove pointless libcall optimization of fma/mad

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 9 16:38:12 PDT 2023


Author: Matt Arsenault
Date: 2023-08-09T19:37:52-04:00
New Revision: 6dbd458128dd24d35c0fe9121bde62f0c4da92f8

URL: https://github.com/llvm/llvm-project/commit/6dbd458128dd24d35c0fe9121bde62f0c4da92f8
DIFF: https://github.com/llvm/llvm-project/commit/6dbd458128dd24d35c0fe9121bde62f0c4da92f8.diff

LOG: AMDGPU: Remove pointless libcall optimization of fma/mad

After the library is linked and trivially inlined, the generic fma and
fmuladd intrinsics already handle these cases, with precise fast-math-flag
handling. The libcall fold required all fast-math flags, when only nsz is
needed for the fma(a, b, 0) case.

https://reviews.llvm.org/D156677
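
For illustration only (not part of this patch), a minimal IR sketch of the
fma(a, b, 0) case; the function name here is made up. The generic combines
can fold the intrinsic to an fmul when nsz is present, and only then: a*b
may be -0.0, in which case adding the +0.0 operand rounds the result of the
fma to +0.0.

  define float @fma_addend_zero(float %a, float %b) {
    ; With nsz this may fold to: %v = fmul nsz float %a, %b
    ; Without nsz it must not: if %a * %b is -0.0, the fma result is +0.0.
    %v = call nsz float @llvm.fma.f32(float %a, float %b, float 0.0)
    ret float %v
  }

  declare float @llvm.fma.f32(float, float, float)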

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
    llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 6beab66635a703..6f96c71c830f49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -71,9 +71,6 @@ class AMDGPULibCalls {
   // rootn
   bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
 
-  // fma/mad
-  bool fold_fma_mad(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
   // -fuse-native for sincos
   bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
 
@@ -581,11 +578,6 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
     case AMDGPULibFunc::EI_COS:
     case AMDGPULibFunc::EI_SIN:
       return fold_sincos(FPOp, B, FInfo);
-    case AMDGPULibFunc::EI_FMA:
-    case AMDGPULibFunc::EI_MAD:
-    case AMDGPULibFunc::EI_NFMA:
-      // skip vector function
-      return (getVecSize(FInfo) != 1) ? false : fold_fma_mad(CI, B, FInfo);
     default:
       break;
     }
@@ -1016,50 +1008,6 @@ bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
   return false;
 }
 
-bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
-                                  const FuncInfo &FInfo) {
-  Value *opr0 = CI->getArgOperand(0);
-  Value *opr1 = CI->getArgOperand(1);
-  Value *opr2 = CI->getArgOperand(2);
-
-  ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
-  ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
-  if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) {
-    // fma/mad(a, b, c) = c if a=0 || b=0
-    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
-    replaceCall(CI, opr2);
-    return true;
-  }
-  if (CF0 && CF0->isExactlyValue(1.0f)) {
-    // fma/mad(a, b, c) = b+c if a=1
-    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2
-                      << "\n");
-    Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd");
-    replaceCall(CI, nval);
-    return true;
-  }
-  if (CF1 && CF1->isExactlyValue(1.0f)) {
-    // fma/mad(a, b, c) = a+c if b=1
-    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2
-                      << "\n");
-    Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd");
-    replaceCall(CI, nval);
-    return true;
-  }
-  if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) {
-    if (CF->isZero()) {
-      // fma/mad(a, b, c) = a*b if c=0
-      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * "
-                        << *opr1 << "\n");
-      Value *nval = B.CreateFMul(opr0, opr1, "fmamul");
-      replaceCall(CI, nval);
-      return true;
-    }
-  }
-
-  return false;
-}
-
 // Get a scalar native builtin single argument FP function
 FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
                                                  const FuncInfo &FInfo) {

diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 2d3e7c336c1e9a..142107e43ebc9e 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -471,7 +471,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x
-; GCN: store float %y, ptr addrspace(1) %a
+; GCN: %call = tail call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
 define amdgpu_kernel void @test_fma_0x(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -483,7 +483,7 @@ entry:
 declare float @_Z3fmafff(float, float, float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0
-; GCN: store float %y, ptr addrspace(1) %a
+; GCN: %call = tail call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
 define amdgpu_kernel void @test_fma_x0(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -493,7 +493,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x
-; GCN: store float %y, ptr addrspace(1) %a
+; GCN: %call = tail call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
 define amdgpu_kernel void @test_mad_0x(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -505,7 +505,7 @@ entry:
 declare float @_Z3madfff(float, float, float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0
-; GCN: store float %y, ptr addrspace(1) %a
+; GCN: %call = tail call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
 define amdgpu_kernel void @test_mad_x0(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -515,7 +515,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y
-; GCN: %fmaadd = fadd fast float %tmp, %y
+; GCN: %call = tail call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
 define amdgpu_kernel void @test_fma_x1y(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -525,7 +525,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy
-; GCN: %fmaadd = fadd fast float %tmp, %y
+; GCN: %call = tail call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
 define amdgpu_kernel void @test_fma_1xy(ptr addrspace(1) nocapture %a, float %y) {
 entry:
   %tmp = load float, ptr addrspace(1) %a, align 4
@@ -535,7 +535,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0
-; GCN: %fmamul = fmul fast float %tmp1, %tmp
+; GCN: %call = tail call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
 define amdgpu_kernel void @test_fma_xy0(ptr addrspace(1) nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1
